diff --git "a/content/zh/post/SirLYX/DB4AI-\346\267\273\345\212\240\346\226\260\346\234\272\345\231\250\345\255\246\344\271\240\347\256\227\346\263\225.md" "b/content/zh/post/SirLYX/DB4AI-\346\267\273\345\212\240\346\226\260\346\234\272\345\231\250\345\255\246\344\271\240\347\256\227\346\263\225.md" new file mode 100644 index 0000000000000000000000000000000000000000..3213af50e8cae7d30c70a32a493748d3b53e16ae --- /dev/null +++ "b/content/zh/post/SirLYX/DB4AI-\346\267\273\345\212\240\346\226\260\346\234\272\345\231\250\345\255\246\344\271\240\347\256\227\346\263\225.md" @@ -0,0 +1,163 @@ +# **DB4AI新算法添加** +## **前提说明** +**文档前提** + +数据库版本:openGauss 3.1.0 + +源码下载链接:https://gitee.com/opengauss/openGauss-server/tree/3.1.0/ + +**文档目标** + +基于逻辑回归算法说明添加新算法的主要过程,新算法名为new_logregr,添加new前缀以区分openGauss已有的logregr,使用公共超参数learning_rate和max_iterations,同时为了添加并验证自定义超参数,会进行自定义整数型超参数new_hyperp的添加。 + +本文档仅基于逻辑回归算法进行添加,主要说明添加算法的主要过程,不同算法依据需要在细节处进行针对性源码修改。 + +## **算法逻辑** + + + **逻辑回归算法:** + https://huaweicloud.csdn.net/637f7c81dacf622b8df85f47.html + + +## **添加步骤** + +**步骤一** + +在源码路径src/gausskernel/dbmind/db4ai/executor/algorithms文件夹下新建文件**new_logregr.cpp**,并引入头文件db4ai/gd.h。 + +![](image/step1.png) + +**步骤二** + +在new_logregr.cpp文件中添加梯度计算函数、损失计算函数、推理函数(依据具体算法需自行编写梯度计算、损失计算、推理等过程),依次添加逻辑回归函数的梯度计算函数new_logreg_gradients、损失计算函数new_logreg_test、推理函数new_logreg_predict。 + +首先添加梯度计算函数new_logreg_gradients,梯度计算函数在每一batch被调用,用以计算出本次batch的梯度,而后会将梯度更新到权重里,依据不同的机器学习算法自行实现梯度计算。逻辑回归梯度下降算法具体实现如下图所示。 + +![](image/step2.png) + +函数参数中cfg结构体组成如下图,包含了超参数hyperp、样本features、权重矩阵weights、目标样本值dep_var(真实值)等信息,基于这些信息进行梯度计算。 + +![](image/step2.1.png) + +而后添加损失计算函数new_logreg_test,如下图所示,此函数主要是用于计算当前的loss,返回的loss用于判断是否已经低于设定的阈值(tolerance),低于阈值则结束训练。 + +![](image/step2.2.png) + +最后添加推理函数,推理时调用推理函数利用训练好的权重计算结果并将结果返回,不同算法返回类型需依据算法进行确定,如下图所示。 + +![](image/step2.3.png) + +**步骤三(添加超参数new_hyperp)** + +在源码src/include/db4ai/gd.h文件的HyperparametersGD结构中,添加int new_hyperp,如下图。 + +![](image/step3.1.png) + 
+并在同一个文件中GD_HYPERPARAMETERS_SUPERVISED处添加该超参数new_hyperp的默认值等限定信息,如下图。GD_HYPERPARAMETERS_SUPERVISED进行超参数初始化时使用。 + +![](image/step3.2.png) + +**步骤四(添加超参数初始化验证函数)** + +在new_logregr.cpp文件中添加new_regression_hyperparameter_definitions,其中GD_HYPERPARAMETERS_SUPERVISED主要是设置超参数的默认值等信息,而后添加用于获取超参数定义的函数new_gd_get_hyperparameters_regression,该函数主要是在训练前获取超参数进行初始化设置、验证等前期准备工作,如下图。 + +![](image/step4.1.png) + +最后在src/gausskernel/dbmind/db4ai/executor/optimize/gd/gd.cpp中的gd_run函数中添加Assert函数来验证超参数new_hyperp是否是指定值。 + +![](image/step4.2.png) + +**步骤五(填充算法对应的GradientDescent结构体,该结构体会在训练过程中充当算法的代表,提供训练、计算梯度、损失计算等函数的调用)** + +GradientDescent结构体的结构,如下图。 + +![](image/step5.1.png) + +在new_logregr.cpp中添加该算法对应的GradientDescent结构体new_gd_logistic_regression,如下图。 + +![](image/step5.2.png) + +AlgorithmAPI中NEW_LOGISTIC_REGRESSION为枚举类型,需要在src/include/db4ai/db4ai.h头文件的AlgorithmML中添加,如下图。后续会通过此项获取对应算法的GradientDescent结构,同时注意NEW_LOGISTIC_REGRESSION在AlgorithmML中位置顺序。 + +![](image/step5.3.png) + +在src/include/db4ai/gd.h中进行extern 外部变量声明如下图。 + +![](image/step5.4.png) + +在src/gausskernel/dbmind/db4ai/catalog/aifuncs.cpp文件中添加&new_gd_logistic_regression.algo,其所在的位置必须与NEW_LOGISTIC_REGRESSION在AlgorithmML中位置顺序一样,因为会根据对应顺序进行遍历检索,如下图。 + +![](image/step5.5.png) + +在src/gausskernel/dbmind/db4ai/executor/optimize/gd/gd.cpp中的gd_get_algorithm函数中做如下图添加,该函数通过枚举NEW_LOGISTIC_REGRESSION获得对应的GradientDescent结构体new_gd_logistic_regression。 + +![](image/step5.6.png) + +在src/gausskernel/dbmind/db4ai/catalog/model_warehouse.cpp中做如下图添加。 + +![](image/step5.7.png) + +修改src/gausskernel/dbmind/db4ai/commands/create_model.cpp,添加new_logistic_regression,如下图。 + +![](image/step5.8.png) + +**步骤六(编译)** + +在src/gausskernel/dbmind/db4ai/executor/algorithms/CMakeLists.txt中添加算法文件路径,如下图。 + +![](image/step6.1.png) + +在src/gausskernel/dbmind/db4ai/executor/algorithms/Makefile中添加new_logregr.o,如下图。 + +![](image/step6.2.png) + +## **编译验证** + +主要验证编译后的new_logregr算法能否正常运行。 + +**编译** + +参考相应的编译文档。 + +**数据库数据准备** + +``` +CREATE TABLE houses (id INTEGER, tax 
INTEGER, bedroom INTEGER, bath DOUBLE PRECISION, price INTEGER, size INTEGER, lot INTEGER, mark text); +``` + +``` +insert into houses(id, tax, bedroom, bath, price, size, lot, mark) VALUES +(1,590,2,1,50000,770,22100,'a+'), +(2,1050,3,2,85000,1410,12000,'a+'), +(3,20,2,1,22500,1060,3500,'a-'), +(4,870,2,2,90000,1300,17500,'a+'), +(5,1320,3,2,133000,1500,30000,'a+'), +(6,1350,2,1,90500,850,25700,'a-'), +(7,2790,3,2.5,260000,2130,25000,'a+'), +(8,680,2,1,142500,1170,22000,'a-'), +(9,1840,3,2,160000,1500,19000,'a+'), +(10,3680,4,2,240000,2790,20000,'a-'), +(11,1660,3,1,87000,1030,17500,'a+'), +(12,1620,3,2,118500,1250,20000,'a-'), +(13,3100,3,2,140000,1760,38000,'a+'), +(14,2090,2,3,148000,1550,14000,'a-'), +(15,650,3,1.5,65000,1450,12000,'a-'); +``` + +**模型训练与验证** + +``` +CREATE MODEL new_price_model USING new_logistic_regression FEATURES size, lot TARGET mark FROM HOUSES WITH learning_rate=0.88, new_hyperp=1, max_iterations=default; +``` + +执行后返回成功信息,证明模型已训练完成且new_hyperp参数值已经被正确设置为1,并在Assert函数中被获取检验。 + +![](image/verification.png) + +![](image/verification1.png) + +![](image/verification2.png) + +![](image/verification3.png) + + diff --git a/content/zh/post/SirLYX/image/step1.png b/content/zh/post/SirLYX/image/step1.png new file mode 100644 index 0000000000000000000000000000000000000000..ac61183328ccb7a3fc5d647c683df0c1b4921afb Binary files /dev/null and b/content/zh/post/SirLYX/image/step1.png differ diff --git a/content/zh/post/SirLYX/image/step2.1.png b/content/zh/post/SirLYX/image/step2.1.png new file mode 100644 index 0000000000000000000000000000000000000000..8ade4938e4cfa2b77fdc14f1e6fac5741eb338e3 Binary files /dev/null and b/content/zh/post/SirLYX/image/step2.1.png differ diff --git a/content/zh/post/SirLYX/image/step2.2.png b/content/zh/post/SirLYX/image/step2.2.png new file mode 100644 index 0000000000000000000000000000000000000000..9c5105413d34ff9b34be0d48f881c2467c9799c7 Binary files /dev/null and b/content/zh/post/SirLYX/image/step2.2.png differ diff 
--git a/content/zh/post/SirLYX/image/step2.3.png b/content/zh/post/SirLYX/image/step2.3.png new file mode 100644 index 0000000000000000000000000000000000000000..1384860f02603eb1f168c71d32a26b8e04872793 Binary files /dev/null and b/content/zh/post/SirLYX/image/step2.3.png differ diff --git a/content/zh/post/SirLYX/image/step2.png b/content/zh/post/SirLYX/image/step2.png new file mode 100644 index 0000000000000000000000000000000000000000..ba3ca52fcf0848e98824c9841a473184c22ff7a9 Binary files /dev/null and b/content/zh/post/SirLYX/image/step2.png differ diff --git a/content/zh/post/SirLYX/image/step3.1.png b/content/zh/post/SirLYX/image/step3.1.png new file mode 100644 index 0000000000000000000000000000000000000000..9412e44bd5f36d38ff3cc08473b1bc3a4a750882 Binary files /dev/null and b/content/zh/post/SirLYX/image/step3.1.png differ diff --git a/content/zh/post/SirLYX/image/step3.2.png b/content/zh/post/SirLYX/image/step3.2.png new file mode 100644 index 0000000000000000000000000000000000000000..fb9a79686878248c875df2a47cd7e01fa75f7688 Binary files /dev/null and b/content/zh/post/SirLYX/image/step3.2.png differ diff --git a/content/zh/post/SirLYX/image/step4.1.png b/content/zh/post/SirLYX/image/step4.1.png new file mode 100644 index 0000000000000000000000000000000000000000..e28c64b201a72fac91d5459f27b7145862ffc4a7 Binary files /dev/null and b/content/zh/post/SirLYX/image/step4.1.png differ diff --git a/content/zh/post/SirLYX/image/step4.2.png b/content/zh/post/SirLYX/image/step4.2.png new file mode 100644 index 0000000000000000000000000000000000000000..ab8f610295246157e5081aca271897f7d0081b35 Binary files /dev/null and b/content/zh/post/SirLYX/image/step4.2.png differ diff --git a/content/zh/post/SirLYX/image/step5.1.png b/content/zh/post/SirLYX/image/step5.1.png new file mode 100644 index 0000000000000000000000000000000000000000..39ec21117e6f674569070379edbddbde1ab6841e Binary files /dev/null and b/content/zh/post/SirLYX/image/step5.1.png differ diff --git 
a/content/zh/post/SirLYX/image/step5.2.png b/content/zh/post/SirLYX/image/step5.2.png new file mode 100644 index 0000000000000000000000000000000000000000..0e85080a650df08066968999296e365725b734f7 Binary files /dev/null and b/content/zh/post/SirLYX/image/step5.2.png differ diff --git a/content/zh/post/SirLYX/image/step5.3.png b/content/zh/post/SirLYX/image/step5.3.png new file mode 100644 index 0000000000000000000000000000000000000000..a4700f7054659fc9103935b4316f5517d2798459 Binary files /dev/null and b/content/zh/post/SirLYX/image/step5.3.png differ diff --git a/content/zh/post/SirLYX/image/step5.4.png b/content/zh/post/SirLYX/image/step5.4.png new file mode 100644 index 0000000000000000000000000000000000000000..57e263461adf3c726286a790041b1966b01ad0aa Binary files /dev/null and b/content/zh/post/SirLYX/image/step5.4.png differ diff --git a/content/zh/post/SirLYX/image/step5.5.png b/content/zh/post/SirLYX/image/step5.5.png new file mode 100644 index 0000000000000000000000000000000000000000..2db238aadf6089a88e2c881f84ebfb5d775fec4b Binary files /dev/null and b/content/zh/post/SirLYX/image/step5.5.png differ diff --git a/content/zh/post/SirLYX/image/step5.6.png b/content/zh/post/SirLYX/image/step5.6.png new file mode 100644 index 0000000000000000000000000000000000000000..eb70ba4deec37c9e38fa2b559ef7fa9f4e07d6f8 Binary files /dev/null and b/content/zh/post/SirLYX/image/step5.6.png differ diff --git a/content/zh/post/SirLYX/image/step5.7.png b/content/zh/post/SirLYX/image/step5.7.png new file mode 100644 index 0000000000000000000000000000000000000000..3936e8d6735013aca8c0a79ac30c2c5c88e1d464 Binary files /dev/null and b/content/zh/post/SirLYX/image/step5.7.png differ diff --git a/content/zh/post/SirLYX/image/step5.8.png b/content/zh/post/SirLYX/image/step5.8.png new file mode 100644 index 0000000000000000000000000000000000000000..401b4e0b8c2f04a16da4ac30645ce8da006e3eaf Binary files /dev/null and b/content/zh/post/SirLYX/image/step5.8.png differ diff --git 
a/content/zh/post/SirLYX/image/step6.1.png b/content/zh/post/SirLYX/image/step6.1.png new file mode 100644 index 0000000000000000000000000000000000000000..79a498f071fda034aa2ca82fc8dd33f9693431e2 Binary files /dev/null and b/content/zh/post/SirLYX/image/step6.1.png differ diff --git a/content/zh/post/SirLYX/image/step6.2.png b/content/zh/post/SirLYX/image/step6.2.png new file mode 100644 index 0000000000000000000000000000000000000000..3c9707c2a5293272bf4d285b7e3c73cc2196b031 Binary files /dev/null and b/content/zh/post/SirLYX/image/step6.2.png differ diff --git a/content/zh/post/SirLYX/image/verification.png b/content/zh/post/SirLYX/image/verification.png new file mode 100644 index 0000000000000000000000000000000000000000..9b6711e395daaa6f7ef647a7a777f8411bbf6c1b Binary files /dev/null and b/content/zh/post/SirLYX/image/verification.png differ diff --git a/content/zh/post/SirLYX/image/verification1.png b/content/zh/post/SirLYX/image/verification1.png new file mode 100644 index 0000000000000000000000000000000000000000..7b0a2ea6e8a722a69f04b687164c670c97a9a08f Binary files /dev/null and b/content/zh/post/SirLYX/image/verification1.png differ diff --git a/content/zh/post/SirLYX/image/verification2.png b/content/zh/post/SirLYX/image/verification2.png new file mode 100644 index 0000000000000000000000000000000000000000..5258d79333001f8541cb4c06f631a8362783a9ab Binary files /dev/null and b/content/zh/post/SirLYX/image/verification2.png differ diff --git a/content/zh/post/SirLYX/image/verification3.png b/content/zh/post/SirLYX/image/verification3.png new file mode 100644 index 0000000000000000000000000000000000000000..b002557215f66d677146021c823fa225dd121a51 Binary files /dev/null and b/content/zh/post/SirLYX/image/verification3.png differ