# 2021华为杯数学建模B题 **Repository Path**: way1024/huaweicup ## Basic Information - **Project Name**: 2021华为杯数学建模B题 - **Description**: 2021年华为杯数学建模B题 - **Primary Language**: R - **License**: Not specified - **Default Branch**: master - **Homepage**: None - **GVP Project**: No ## Statistics - **Stars**: 1 - **Forks**: 0 - **Created**: 2021-10-15 - **Last Updated**: 2024-01-02 ## Categories & Tags **Categories**: Uncategorized **Tags**: None ## README # 第一题 读入数据,并提取出"2020/8/25-2020/8/28"所在的行: ```{R} # 读取数据 table = read.table("E://2022数学建模//1_临界值.csv",header = T, sep = ',',encoding = 'UTF-8') A_data = read.table("E://2022数学建模//1_B_ORIGINAL.csv", header =T, sep =",",encoding = 'UTF-8') # 并提取出“2020/8/25-2020/8/28”所在的行 index = c(which(A_data[,1]=="2020/8/25"), which(A_data[,1]=="2020/8/26"), which(A_data[,1]=="2020/8/27"), which(A_data[,1]=="2020/8/28")) # 提取出“2020/8/25-2020/8/28”四日的污染物浓度 list(print(index),as.matrix(A_data[index,3:8])) ``` 计算四日的IAQI ```{R} ### 计算四日的IAQI IAQI=matrix(NA,4,6) `colnames<-`(IAQI,c('SO2','NO2','PM10','PM2.5','O3','CO')) for(zz in (1:4)){ for (jj in (1:6)) { for ( ii in (2:8)){ if(table[(jj+1),(ii+1)]>A_data[(497+zz),(jj+2)] & A_data[(497+zz),(jj+2)]>table[(jj+1),ii]){ BP_hi=table[(jj+1),(ii+1)] BP_lo=table[(jj+1),(ii)] IAQI_hi=table[1,(ii+1)] IAQI_lo=table[1,ii] break }else{ BP_hi=0 BP_lo=0 IAQI_hi=0 IAQI_lo=0 } } IAQI[zz,jj]=(IAQI_hi-IAQI_lo)/(BP_hi-BP_lo)*(A_data[(497+zz),(jj+2)]-BP_lo)+IAQI_lo } } ## 每行表示每一日的IAQI值 `colnames<-`(IAQI,c('SO2','NO2','PM10','PM2.5','O3','CO')) days = c("2020/08/25","2020/08/26","2020/08/27","2020/08/28") # 每一行取最大值 for (i in (1:4)) { print(paste(days[i],"AQI",round(max(IAQI[i,])))) } ``` # 第二题 ## 2.1 数据处理 第二题,首先将附件一"监测点A逐小时污染物浓度与气象实测数据"中的"-"替换为"NA",再读入数据进行处理 ```{R} library(forecast) library(zoo) library(ggplot2) #install.packages("hrbrthemes") library(hrbrthemes) library(ggplot2) library(ggpubr) #install.packages("VIM") library(VIM) library(mice) ### 第二题 ------ # 读实测数据, 填充空---- #install.packages("VIM") library(VIM) # 将附件一中, A点每小时实测数据中的"-"换为"NA"值, 方便插值 A_real <- read.csv("E://2022数学建模//A点每时测量.csv", header = T, na.strings = 'NA', encoding = "UTF-8-BOM") head(A_real) table = read.table("E://2022数学建模//1_临界值.csv",header = T, sep = ',',encoding = 'UTF-8') ``` 查看六种污染物的密度曲线及箱线图,判断离群值,并将离群值赋值为空值: ```{R} a0 = ggplot(A_real,aes(x=SO2))+geom_density(colour="steelblue") b0 = ggplot(A_real,aes(x=NO2))+geom_density(colour="red") c0 = ggplot(A_real,aes(x=PM10))+geom_density(colour="green") d0 = ggplot(A_real,aes(x=PM2.5))+geom_density(colour="black") e0 = ggplot(A_real,aes(x=O3))+geom_density(colour="orange") f0 = ggplot(A_real,aes(x=CO))+geom_density(colour="purple") ggarrange(a0,b0,c0,d0,e0,f0, ncol = 2, nrow = 3) par(mfrow=c(2,3)) # 六个污染物的箱线图,并将离群值赋为空值 data_real = A_real[,-(2)] for (i in (2:7)) { outlier_values = boxplot(A_real[,(i+1)],main=colnames(A_real)[(i+1)])$out AAA = A_real[,(i+1)] AAA[which(AAA %in% outlier_values)]=NA data_real[,i]=AAA } head(data_real) ``` 将浓度小于0的值赋值为空值 ```{R} # 六个污染物的浓度小于0的值赋值为控制 for (i in (2:7)) { zero = which(data_real[,i]<0) AAA = data_real[,i] AAA[zero]=NA data_real[,i]=AAA } # 检验是否还有数据浓度小于0 which(data_real<0) ``` 对缺失值进行可视化: ```{R} # 缺失值总结 summary(data_real) # 哪些行有缺失值 head(data_real[!complete.cases(data_real),]) # 缺失值可视化 aggr(data_real, prop = F, number = T) # 缺失值总数 sum(is.na(data_real)) ``` 用多重插补法进行插补: ```{R} ## 填充缺失值 temp1 <-mice(data_real[,(-1)],m=6,method = "pmm",maxit = 30) # 缺失值可视化 aggr(complete(temp1), prop = F, number = T) # 插值的密度曲线 densityplot(temp1) # 补全后的A点实测值 complete_A_real = complete(temp1) head(complete_A_real) ``` 对插值后的数据进行可视化处理 ```{R} # 补全后的离群值 par(mfrow=c(2,3)) boxplot(complete_A_real[,1], main="SO2") boxplot(complete_A_real[,2], main="NO2") boxplot(complete_A_real[,3], main="PM10") boxplot(complete_A_real[,4], main="PM2.5") boxplot(complete_A_real[,5], main="O3") boxplot(complete_A_real[,6], main="CO") # 一次绘制六种污染物浓度的分布密度曲线(插值后) a1 = ggplot(complete_A_real,aes(x=SO2))+geom_density(colour="steelblue") b1 = ggplot(complete_A_real,aes(x=NO2))+geom_density(colour="red") c1 = ggplot(complete_A_real,aes(x=PM10))+geom_density(colour="green") d1 = ggplot(complete_A_real,aes(x=PM2.5))+geom_density(colour="black") e1 = ggplot(complete_A_real,aes(x=O3))+geom_density(colour="orange") f1 = ggplot(complete_A_real,aes(x=CO))+geom_density(colour="purple") ggarrange(a1,b1,c1,d1,e1,f1, ncol = 2, nrow = 3) # 插值后是否有缺失值检查 anyNA(complete_A_real) # 写为 .csv 文件 # write.csv(complete_A_real,'A点每时实测(插值).csv') ``` 下面计算各个污染物实测浓度的逐日均值: ```{R} # 计算各浓度的逐日均值 a = cbind(A_real[,1],complete_A_real); time=as.POSIXct(A_real[,1]) ; head(time) day=strftime(time,format = "%Y-%m-%d "); a[,1]=day; # 查看每个时间运行时间点的样本值, head(table(day)) head(unique(day)) complete_A_real_day = as.data.frame(a) complete_A_real_day = complete_A_real_day[-which(complete_A_real_day[,1]==as.Date("2021-07-13")),] # 不同污染物每天的时间点的平均浓度 df_real = aggregate(x = complete_A_real_day[, -1], by =list(complete_A_real_day[,1]), FUN = mean);head(df_real) anyNA(df_real) # 最大8小时平均算法 max_8 <-function(x){ x = as.vector(x) y = c() if(length(x)>8){ for (i in 1:(length(x)-7)) { y[i]=mean(x[i:(i+7)]) }}else{for (i in (1:length(x))) { y[i]=x[i] }} return(max(y)) } # 将臭氧的值替换为最大八小时平均 df_real[,6]=aggregate(x = complete_A_real_day[, 6], by = list(complete_A_real_day[,1]), FUN = max_8)[,-1]; names(df_real)= c("date","SO2","NO2","PM10","PM2.5","O3","CO","TEMPER","WET","Mbar","SPEED","DIRECTION") anyNA(df_real) head(df_real) # 写出 # write.csv(df_real, "E://2022数学建模//A点各浓度实测逐日均值.csv") ``` ## 2.2 聚类 ```{R} ## 聚类 ---- #要是没有这个包的话,首先需要安装一下 #install.packages("factoextra") #载入包 library(factoextra) newdata <- read.csv("E://2022数学建模//A点每时实测(插值).csv", header = T, na.strings = 'NA', encoding = "UTF-8-BOM") head(newdata) anyNA(newdata) # 分别聚类---- # 数据进行标准化 par(mfrow=c(4,3)) for (i in 4:14) { df <- scale(newdata[,i]) # 查看数据的前五行 head(df, n = 5) #利用k-mean是进行聚类 km_result <- kmeans(df, 5, nstart =2000,iter.max=5000) #提取类标签并且与原始数据进行合并 dd <- cbind(newdata, cluster = km_result$cluster);head(dd) class_1 = dd[which(dd$cluster==1),] class_2 = dd[which(dd$cluster==2),] class_3 = dd[which(dd$cluster==3),] class_4 = dd[which(dd$cluster==4),] class_5 = dd[which(dd$cluster==5),] plot(class_1[,i],class_1[,i],type="l", ylim = c(min(dd[,i]),max(dd[,i])), xlim=c(min(dd[,i]),max(dd[,i])), lwd = 3, xlab ="", ylab = "", main = paste(colnames(dd)[i]), col = "blue") lines(class_4[,i],class_4[,i],type="l",col="red",lwd = 3) lines(class_3[,i],class_3[,i],type="l",col="green",lwd = 3) lines(class_2[,i],class_2[,i],type="l",col="orange",lwd = 3) lines(class_5[,i],class_5[,i],type="l",col="purple",lwd = 3) print(paste("第",1,"类",colnames(dd)[i],"的浓度范围为:",min(class_1[i]),"至",max(class_1[i]),sep='')) print(paste("第",2,"类",colnames(dd)[i],"的浓度范围为:",min(class_2[i]),"至",max(class_2[i]),sep='')) print(paste("第",3,"类",colnames(dd)[i],"的浓度范围为:",min(class_3[i]),"至",max(class_3[i]),sep='')) print(paste("第",4,"类",colnames(dd)[i],"的浓度范围为:",min(class_4[i]),"至",max(class_4[i]),sep='')) print(paste("第",5,"类",colnames(dd)[i],"的浓度范围为:",min(class_5[i]),"至",max(class_5[i]),sep='')) } ``` # 第三题 ## 3.1 A点 读取A点每小时一次预报时间并进行处理 ```{R} A_forcast <- read.csv("E://2022数学建模//A点每时预测.csv", header = T, na.strings = 'NA', encoding = "UTF-8") # 令6个污染物的和5个气候变量的离群值赋为空值 par(mfrow=c(2,3)) data_forcast = A_forcast[,-(2:3)] for (i in (2:22)) { outlier_values = boxplot(data_forcast[,i])$out length(outlier_values) AAA = data_forcast[,i] AAA[which(AAA %in% outlier_values)]=NA data_forcast[,(i)]=AAA } head(data_forcast) # 六个污染物的浓度小于0的值赋值为空值 for (i in (2:7)) { zero = which(data_forcast[,i]<0) AAA = data_forcast[,i] AAA[zero]=NA data_forcast[,i]=AAA } # 多重插值 temp2 <-mice(data_forcast[,2:7], m=10,method = "pmm",maxit = 30) complete_A_forcast = complete(temp2) # 计算各浓度的逐日均值及O3的8小时最大平均浓度 aa = cbind(A_forcast[,1],complete_A_forcast); time=as.POSIXct(aa[,1]) day=strftime(time) aa[,1]=day # 查看每个时间运行时间点的样本值, 取前24个 complete_A_forcast_day = as.data.frame(matrix(NA, length(unique(day))*24,6)) for(i in (1:length(unique(aa[,1]))) ){ complete_A_forcast_day[((i-1)*24+1):(24*i),]=complete_A_forcast[which(aa[,1]==unique(aa[,1])[i])[1:24],] } # 不同污染物每天24个时间点的浓度 complete_A_forcast_day = cbind(rep(unique(day), rep(24,length(unique(day)))),complete_A_forcast_day) complete_A_forcast_day = complete_A_forcast_day[-which(complete_A_forcast_day[,1]==as.Date("2021-07-13")),] # 不同污染物每天的时间点的平均浓度 df_forcast = aggregate(x = complete_A_forcast_day[, 2:7], by =list(complete_A_forcast_day[,1]), FUN = mean) # 最大8小时平均算法 max_8 <-function(x){ x = as.vector(x) y = c() if(length(x)>8){ for (i in 1:(length(x)-7)) { y[i]=mean(x[i:(i+7)]) }}else{for (i in (1:length(x))) { y[i]=x[i] }} return(max(y)) } # 将臭氧的值替换为最大八小时平均 df_forcast[,6]=aggregate(x = complete_A_forcast_day[, 6], by = list(complete_A_forcast_day[,1]), FUN = max_8)[,-1];head(df_forcast) names(df_forcast)= c("date","SO2","NO2","PM10","PM2.5","O3","CO") anyNA(df_forcast) # 结果写为 .csv # write.csv(df_forcast,'E://2022数学建模//A点各浓度预测逐日均值.csv') ``` ### 3.1.1 对SO2进行预测 #### 1 读取数据 ```{R} # 读取数据---- A_df_forcast = read.csv("E://2022数学建模//A点各浓度预测逐日均值.csv", header = T, na.strings = 'NA', encoding = "UTF-8") A_df_forcast$date = as.Date(A_df_forcast$date) head(A_df_forcast) A_df_real = read.csv("E://2022数学建模//A点各浓度实测逐日均值.csv", header = T, na.strings = 'NA', encoding = "UTF-8") A_df_real$date = as.Date(A_df_real$date) head(A_df_real) ``` #### 2 构建第一模型 ```{R} library(nnet) # 提取一次预报值和实测值中重合的日期 date1 = unique(A_df_forcast$date);head(date1) gsub(" ","",date1) # 去除空格 date2 = unique(A_df_real$date);head(date2) gsub(" ","",date2) # 去除空格 date3 = intersect(date1, date2);date3=as.Date(date3);head(date3) ind1 = which(A_df_forcast$date %in% date3);ind1 ind2 = which(A_df_real$date %in% date3 );ind2 # step 1 用一次预报数据进行预测 ---- A_value_1_SO2 = A_df_forcast[ind1,]$SO2;head(A_value_1_SO2) A_value_2_SO2 = A_df_real[ind2,]$SO2;head(A_value_2_SO2) length(A_value_1_SO2)==length(A_value_2_SO2) # 保持 date 一致 A_value_1 = A_df_forcast[ind1,]$SO2;head(A_value_1) A_value_2 = A_df_real[ind2,]$SO2;head(A_value_2) # 构造线性模型 A_fit_nnet = nnet(A_value_2~A_value_1, maxit = 1000, linout = T, trace = F, size = 10) # 非线性拟合图 plot(A_value_2,type="l", col = "red",main="二次预报第一模型对二氧化硫污染物浓度的拟合值",ylab="浓度") lines(A_fit_nnet$fitted.values, col = "blue") legend("topright", inset=.05, title="图例", c("实测值","拟合值"), lty=c(1, 1), col=c("red", "blue")) # 线性残差图 plot(A_fit_nnet$residuals, type="l", main="二次预报第一模型对二氧化硫污染物浓度的拟合残差", ylab = "浓度") # RMSE RMSE1 = sqrt(mean(A_fit_nnet$residuals^2));RMSE1 # 构造下时间序列数据 A_mydata_1 <- ts(A_value_1, start = as.Date(date3[1]), frequency = 1);head(A_mydata_1) length(A_value_1) length(date3) #构造数据表 A_data_time_1 <- data.frame( day = date3, value = A_value_1 );head(A_data_time_1) #数据样式 head(A_data_time_1) head(A_mydata_1) # 建立nnar模型 A_fit1 <- nnetar(A_mydata_1) # 预测 A_fit1_fcast <- forecast(A_fit1,h=10,PI=TRUE) # 二次预报第一模型据得到的未来10天的预测值 c_1 = print(A_fit1_fcast) ``` #### 3 构造第二模型 ```{R} # 构造下时间序列数据 A_mydata_2 <- ts(A_value_2, start = as.Date(date3[1]), frequency = 1);head(A_mydata_2) #构造数据表 A_data_time2 <- data.frame( day = date3, value = A_value_2 );head(A_data_time2) # 进阶绘图 ggplot(A_data_time2, aes(x=day, y=value)) + geom_line(color="red", size=1) + geom_area(colour='red',alpha=0.4)+ scale_x_date(date_labels="%y/%m", date_breaks="month", expand=c(0,0)) + scale_y_continuous(expand = c(0,0))+ facet_grid(~strftime(A_data_time2$day,format = "%Y-%m "), space="free_x", scales="free_x", switch="y") + theme_bw() + theme(#strip.placement = "outside", strip.background = element_rect(fill = "skyblue"), panel.spacing = unit(0, "cm"))+ labs(title="二氧化硫浓逐日实测浓度", x="DATE", y="浓度") #数据样式 head(A_data_time2) head(A_mydata_2) # 建立nnar模型 A_fit2 <- nnetar(A_mydata_2) # 拟合值 autoplot(A_fit2$fitted) # 判断NA值. 因为该时间序列模型无法对前m各样本进行拟合,故前m个样本的拟合值为NA, 故首先标记出m m = max(which(is.na(A_fit2$fitted)));m # GGPLOT 真实值与拟合值图 A_comparison_2 = as.data.frame(cbind(date3[-(1:m)] ,A_value_2[-(1:m)], A_fit2$fitted[-(1:m)])) A_line_2=ggplot(A_comparison_2)+ geom_line(aes(x=A_comparison_2[,1],y=A_comparison_2[,2]),color="red",size=1)+ geom_line(aes(x=A_comparison_2[,1],y=A_comparison_2[,3]),color="blue",size=1)+ labs(title="日均二氧化硫浓度拟合值 (用日均实际测量值拟合)", x="日期", y="浓度") A_line_2 par(mfrow=c(1,1)) # PLOT 拟合图 plot(A_value_2[-(1:m)],type="l", col = "red",main="二次预报第二模型对二氧化硫污染物浓度的拟合值",ylab="浓度");lines(A_fit2$fitted[-(1:m)], col = "blue");legend("topright", inset=.05, title="图例", c("实测值","拟合值"), lty=c(1, 1), col=c("red", "blue")) # PLOT 残差图 plot(A_fit2$residuals[-(1:m)], type="l", main="二次预报第二模型对二氧化硫污染物浓度的拟合残差", ylab = "浓度") # 第一模型与第二模型拟合值残差对比图 plot(A_fit_nnet$residuals[-(1:m)], type="l", col = "blue", main = "第一模型与第二模型对二氧化硫污染物浓度的拟合残差对比图", ylab = "残差");lines(A_fit2$residuals[-(1:m)], col = "red");legend("topright", inset=.05, title="图例", c("第一模型残差","第二模型残差"), lty=c(1, 1), col=c("blue", "red")) # 第二模型预测 A_fit2_fcast <- forecast(A_fit2,h=10,PI=TRUE); # 实测数据得到的未来10天的预测值 c_2 = print(A_fit2_fcast);c_1;c_2 plot(A_fit2_fcast) # 画预测图 plot(A_fit2_fcast, xlim = c(18700,18830),main="13-15日监测点A日均SO2浓度的神经网络时间序列预测", xlab="日期", ylab="浓度") ## 计算RMSE RMSE2 = sqrt(mean((A_fit2$residuals[-which(is.na(A_fit2$residuals))]^2)));RMSE2 ``` #### 4 构建二次预报模型 ```{R} ## step 3 ---- ## 根据RMSE给两个模型赋权 gamma1 = 0.1 gamma2 = 0.9 # 第二模型对二氧化硫的拟合值 A_fit_final = A_fit_nnet$fitted.values[-(1:m)]*gamma1+A_fit2$fitted[-(1:m)]*gamma2 # 二次预报模型对二氧化硫浓度的拟合值与真实值对比图 plot(A_value_2[-(1:m)],xaxt="n",type="l",col="red",main="二次预报模型对二氧化硫污染物浓度的拟合值与真实值",ylab="浓度",xlab="date");lines(A_fit_final, col="blue");legend("topright", inset=.05, title="图例", c("真实值","拟合值"), lty=c(1, 1), col=c("red", "blue"));axis(1,at=seq(1,length(date3)-m),label=date3[-(1:m)]) # 二次预报模型的RMSE均小于第一模型和第二模型的RMSE RMSE = sqrt(mean((A_value_2[-(1:m)]-A_fit_final)^2));RMSE8){ for (i in 1:(length(x)-7)) { y[i]=mean(x[i:(i+7)]) }}else{for (i in (1:length(x))) { y[i]=x[i] }} return(max(y)) } # 将臭氧的值替换为最大八小时平均 B_df_forcast[,6]=aggregate(x = complete_B_forcast_day[, 6], by = list(complete_B_forcast_day[,1]), FUN = max_8)[,-1]; names(B_df_forcast)= c("date","SO2","NO2","PM10","PM2.5","O3","CO") anyNA(B_df_forcast) # 结果写为 .csv # write.csv(B_df_forcast,'E://2022数学建模//B点各浓度预测逐日均值.csv') ``` #### 2 对B点“每小时实测数据进行处理” ```{R} # 将附件一中, B点每小时实测数据中的"-"换为"NA"值, 方便插值 B_real <- read.csv("E://2022数学建模//B点每时测量.csv", header = T, na.strings = 'NA', encoding = "UTF-8-BOM") table = read.table("E://2022数学建模//1_临界值.csv",header = T, sep = ',',encoding = 'UTF-8-BOM') # 六个污染物的的离群值赋为空值 B_data_real = B_real[,-(2)];head(B_data_real) for (i in (2:7)) { outlier_values = boxplot.stats(as.numeric(B_real[,(i+1)]))$out AAA = B_real[,(i+1)] AAA[which(AAA %in% outlier_values)]=NA B_data_real[,i]=AAA } head(B_data_real) # 六个污染物的浓度小于0的值赋值为控制 for (i in (2:7)) { zero = which(B_data_real[,i]<0) AAA = B_data_real[,i] AAA[zero]=NA B_data_real[,i]=AAA } temp1_B <-mice(as.data.frame(lapply(B_data_real[,-1],as.numeric)),m=10,method = "pmm",maxit = 30) complete_B_real = complete(temp1_B) # 计算各浓度的逐日均值 B = cbind(B_real[,1],complete_B_real) B_time=as.POSIXct(B_real[,1]) ;head(B_day) B_day=strftime(B_time,format = "%Y-%m-%d ") B[,1]=B_day # 查看每个时间运行时间点的样本值, complete_B_real_day = as.data.frame(B) complete_B_real_day = complete_B_real_day[-which(complete_B_real_day[,1]==as.Date("2021-07-13")),] # 不同污染物每天的时间点的平均浓度 B_df_real = aggregate(x = complete_B_real_day[, -1], by =list(complete_B_real_day[,1]), FUN = mean) # 最大8小时平均算法 max_8 <-function(x){ x = as.vector(x) y = c() if(length(x)>8){ for (i in 1:(length(x)-7)) { y[i]=mean(x[i:(i+7)]) }}else{for (i in (1:length(x))) { y[i]=x[i] }} return(max(y)) } # 将臭氧的值替换为最大八小时平均 B_df_real[,6]=aggregate(x = complete_B_real_day[, 6], by = list(complete_B_real_day[,1]), FUN = max_8)[,-1] names(B_df_real)= c("date","SO2","NO2","PM10","PM2.5","O3","CO") # 写出 # write.csv(B_df_real, "E://2022数学建模//B点各浓度实测逐日均值.csv") ``` #### 4 读取数据 ```{R} B_df_forcast = read.csv("E://2022数学建模//B点各浓度预测逐日均值.csv", header = T, na.strings = 'NA', encoding = "UTF-8-BOM") B_df_real = read.csv("E://2022数学建模//B点各浓度实测逐日均值.csv", header = T, na.strings = 'NA', encoding = "UTF-8-BOM") ``` #### 5 第一模型 ```{R} # 提取一次预报值和实测值中重合的日期 date1 = unique(B_df_forcast$date); date1 = gsub(" ","",date1) # 去除空格 date2 = unique(B_df_real$date); date2 = gsub(" ","",date2)# 去除空格 date3 = intersect(date1, date2); ind1 = which(gsub(" ","",B_df_forcast$date) %in% date3);ind1 ind2 = which(gsub(" ","",B_df_real$date) %in% date3 );ind2 # step 1 用一次预报数据进行预测 ---- B_value_1_SO2 = B_df_forcast[ind1,]$SO2; B_value_2_SO2 = B_df_real[ind2,]$SO2; length(B_value_1_SO2)==length(B_value_2_SO2) # 保持 date 一致 # 构造神经网络模型 B_fit_nnet_SO2 = nnet(B_value_2_SO2~B_value_1_SO2, maxit = 1000, linout = T, trace = F, size = 20) RMSE1 = sqrt(mean(B_fit_nnet_SO2$residuals^2));RMSE1 # 构造时间序列数据 B_t_1 <- as.Date(B_df_forcast$date);B_t_1 B_mydata_1_SO2 <- ts(B_value_1_SO2, start = as.Date(B_t_1[1]), frequency = 1);head(B_mydata_1_SO2) # 建立nnar模型 B_fit1_SO2 <- nnetar(B_mydata_1_SO2) # 预测 B_fit1_fcast_SO2 <- forecast(B_fit1_SO2,h=10,PI=TRUE) ``` #### 4第二模型 ```{R} B_t_2 <- B_t_1 # 构造下时间序列数据 B_mydata_2_SO2 <- ts(B_value_2_SO2, start = as.Date(B_t_2[1]), frequency = 1);head(B_mydata_2_SO2) length(B_value_2_SO2)==length(B_t_2) # 建立nnar模型 B_fit2_SO2 <- nnetar(B_mydata_2_SO2) # 判断NA值. 因为该时间序列模型无法对前m各样本进行拟合,故前m个样本的拟合值为NA, 故首先标记出m m = max(which(is.na(B_fit2_SO2$fitted)));m # 第二模型预测 B_fit2_fcast_SO2 <- forecast(B_fit2_SO2,h=10,PI=TRUE); # 实测数据得到的未来10天的预测值 c_2 = print(B_fit2_fcast_SO2);c_1;c_2 ## 计算RMSE RMSE2 = sqrt(mean((B_fit2_SO2$residuals[-(1:m)]^2)));RMSE2 ``` #### 5 二次预报模型 ```{R} ## 根据RMSE给两个模型赋权 gamma1 = 0 gamma2 = 1 # 第二模型对二氧化硫的拟合值 B_fit_final_SO2 = B_fit_nnet_SO2$fitted.values[-(1:m)]*gamma1+B_fit2_SO2$fitted[-(1:m)]*gamma2 # 二次模型的预测值 c = gamma1*c_1 + gamma2*c_2;c B_fit_SO2 = B_fit2_fcast_SO2 B_fit_SO2$mean = ts(c[,1],start = 18818) B_fit_SO2$lower = ts(c[,c(2,4)],start = 18818) B_fit_SO2$upper = ts(c[,c(3,5)],start = 18818) plot(B_fit_SO2, xlim = c(18700,18827), main="二次预报模型对二氧化硫污染物浓度的预测值及置信区间", ylab = "浓度", xlab = "日期") # SO2 13-15日预测值最终结果 B_fit_SO2$mean[1:3] ``` ## 3.3 C点 与3.2节内容完全相似,故不单独叙述,完整代码见附件“问题三 C” # 第四题 读取A1 A2 A3 每小时实测 浓度(6个),温度, 湿度, 速度, 方向(4个),共十个指标,分别对“逐小时一次预报数据”和“每小时实测数据”,进行数据预处理。 ## 4.1 A1 ```{R} ## A1 ---- A1_real <- read.csv("E://2022数学建模//A1点每时测量.csv", header = T, na.strings = 'NA', encoding = "UTF-8-BOM") # 转为数值型 A1_real[,-(1:2)]=as.data.frame(lapply(A1_real[,-(1:2)],as.numeric)) A1_real$TIME = strptime(A1_real$TIME, format = "%Y/%m/%d %H:%M") # 由于2019/11/25 9:00 前的缺失数据太多, 故取2019/11/25 9:00 以后的数据作为建模数据集 A1_real = A1_real[-(1:which(A1_real$TIME==strptime("2019/11/25 09:00", format = "%Y/%m/%d %H:%M"))),] # 六个污染物的的离群值赋为空值 A1_data_real = A1_real[,-(2)];head(A1_data_real) for (i in (2:7)) { outlier_values = boxplot.stats(as.numeric(A1_real[,(i+1)]))$out AAA = A1_real[,(i+1)] AAA[which(AAA %in% outlier_values)]=NA A1_data_real[,i]=AAA } # 六个污染物的浓度小于0的值赋值为控制 for (i in (2:7)) { zero = which(A1_data_real[,i]<0) AAA = A1_data_real[,i] AAA[zero]=NA A1_data_real[,i]=AAA } ## 填充缺失值 temp1_A1 <-mice(as.data.frame(lapply(A1_data_real[,-1],as.numeric)),m=10,method = "pmm",maxit = 10) complete_A1_real = cbind(A1_data_real$TIME,complete(temp1_A1)) # 写出 # write.csv(complete_A1_real,"E://2022数学建模//A1点每时实测(插值).csv") ``` ## 4.2 A2 ```{R} ## A2 ---- A2_real <- read.csv("E://2022数学建模//A2点每时测量.csv", header = T, na.strings = 'NA', fileEncoding = "UTF-8-BOM") # 转为数值型 A2_real[,-(1:2)]=as.data.frame(lapply(A2_real[,-(1:2)],as.numeric)) A2_real$TIME = strptime(A2_real$TIME, format = "%Y/%m/%d %H:%M") # 由于2019/11/25 9:00 前的缺失数据太多, 故取2019/11/25 9:00 以后的数据作为建模数据集 A2_real = A2_real[-(1:which(A2_real$TIME==strptime("2019/11/25 09:00", format = "%Y/%m/%d %H:%M"))),] # 六个污染物的的离群值赋为空值 A2_data_real = A2_real[,-(2)];head(A2_data_real) for (i in (2:7)) { outlier_values = boxplot.stats(as.numeric(A2_real[,(i+1)]))$out AAA = A2_real[,(i+1)] AAA[which(AAA %in% outlier_values)]=NA A2_data_real[,i]=AAA } # 六个污染物的浓度小于0的值赋值为控制 for (i in (2:7)) { zero = which(A2_data_real[,i]<0) AAA = A2_data_real[,i] AAA[zero]=NA A2_data_real[,i]=AAA } ## 填充缺失值 temp1_A2 <-mice(as.data.frame(lapply(A2_data_real[,-1],as.numeric)),m=10,method = "pmm",maxit = 30) complete_A2_real = cbind(A2_data_real$TIME,complete(temp1_A2)) # 写出 # write.csv(complete_A2_real,"E://2022数学建模//A2点每时实测(插值).csv") ``` ## 4.3 A3 ```{R} ## A3 ---- A3_real <- read.csv("E://2022数学建模//A3点每时测量.csv", header = T, na.strings = 'NA', fileEncoding = "UTF-8-BOM") head(A3_real) # 转为数值型 A3_real[,-(1:2)]=as.data.frame(lapply(A3_real[,-(1:2)],as.numeric)) A3_real$TIME = strptime(A3_real$TIME, format = "%Y/%m/%d %H:%M") # 由于2019/11/25 9:00 前的缺失数据太多, 故取2019/11/25 9:00 以后的数据作为建模数据集 A3_real = A3_real[-(1:which(A3_real$TIME==strptime("2019/11/25 09:00", format = "%Y/%m/%d %H:%M"))),] # 六个污染物的的离群值赋为空值 A3_data_real = A3_real[,-(2)];head(A3_data_real) for (i in (2:7)) { outlier_values = boxplot.stats(as.numeric(A3_real[,(i+1)]))$out AAA = A3_real[,(i+1)] AAA[which(AAA %in% outlier_values)]=NA A3_data_real[,i]=AAA } # 六个污染物的浓度小于0的值赋值为控制 for (i in (2:7)) { zero = which(A3_data_real[,i]<0) AAA = A3_data_real[,i] AAA[zero]=NA A3_data_real[,i]=AAA } ## 填充缺失值 temp1_A3 <-mice(as.data.frame(lapply(A3_data_real[,-1],as.numeric)),m=10,method = "pmm",maxit = 30) # 补全后的A3点实测值 complete_A3_real = cbind(A3_data_real$TIME,complete(temp1_A3)) # 写出 # write.csv(complete_A3_real,"E://2022数学建模//A3点每时实测(插值).csv") ``` ## 4.4 构建数据集 ```{R} # 读取A点实测数据(插值) ---- A_complete_real = read.csv("E://2022数学建模//A点每时实测(插值).csv", header = T, na.strings = 'NA', fileEncoding = "UTF-8-BOM") # 转为数值型 A_complete_real[,-(1:3)]=as.data.frame(lapply(A_complete_real[,-(1:3)],as.numeric)) A_complete_real$TIME = strptime(A_complete_real$TIME, format = "%Y/%m/%d %H:%M") # 由于2019/11/25 9:00 前的缺失数据太多, 故取2019/11/25 9:00 以后的数据作为建模数据集 A_complete_real = A_complete_real[-(1:which(A_complete_real$TIME==strptime("2019/11/25 09:00", format = "%Y/%m/%d %H:%M"))),] # 构建数据集---- # 提取一次预报值和实测值中重合的日期 date1 = unique(complete_A1_real$`A1_data_real$TIME`) date2 = unique(complete_A2_real$`A2_data_real$TIME`);head(date2) date3 = unique(complete_A3_real$`A3_data_real$TIME`);head(date3) date4 = unique(A_complete_real$TIME);head(date4);date4 = as.POSIXct(date4) date5 = intersect(intersect(date1, date2), intersect(date3, date4)) ind1 = which(complete_A1_real$`A1_data_real$TIME` %in% date5) ind2 = which(complete_A2_real$`A2_data_real$TIME` %in% date5) ind3 = which(complete_A3_real$`A3_data_real$TIME` %in% date5) ind4 = which(as.POSIXct(A_complete_real$TIME, format = "%Y/%m/%d %H:%M") %in% date5) data1 = complete_A1_real[ind1,-1] data2 = complete_A2_real[ind2,-1] data3 = complete_A3_real[ind3,-1] data4 = A_complete_real[ind4,c(2,4,5,6,7,8,9)] data = cbind(data4, data1, data2, data3) head(data) ``` ### 4.5 建模 以SO2为例: ```{R} # 由A1 A2 A3 到A的距离来赋权 weight1 = 14/sum(14,6.0323,10.10934) weight2 = 10.10934/sum(14,6.0323,10.10934) weight3 = 6.0323/sum(14,6.0323,10.10934) weights = rep(1,dim(data)[1]) # 建立随机森林模型 library(randomForest) model.forest<-randomForest(x=data[,-(1:7)], y=data[,2]) # 拟合 fit.value <-predict(model.forest,data[,-(1:7)]) # 读取A点一次预报数据 A_forcast = read.csv("E://2022数学建模//A点每时预测.csv", header = T, na.strings = 'NA', fileEncoding = "UTF-8-BOM") # 只保留离一次预报日期最近的数据 A_forcast = A_forcast[!duplicated(A_forcast$TIME), ] date5 = intersect(as.POSIXct(data$TIME,format = "%Y/%m/%d %H:%M"),as.POSIXct(A_forcast$TIME, format = "%Y/%m/%d %H:%M")) ind5 = which(as.POSIXct(data$TIME,format = "%Y/%m/%d %H:%M") %in% as.POSIXct(A_forcast$TIME, format = "%Y/%m/%d %H:%M")) ind6 = which(as.POSIXct(A_forcast$TIME, format = "%Y/%m/%d %H:%M") %in% as.POSIXct(data$TIME,format = "%Y/%m/%d %H:%M") ) SO2_forcast = A_forcast[ind6,]$SO2 # 画图 par(mfrow=c(1,1)) plot(SO2_forcast,type="l",xaxt="n",col="gray",ylab="SO2",main="协同模型对SO2的拟合值");lines(data[ind5,2], col = "red");lines(fit.value[ind5],col="blue",type="l");legend("topleft", inset=.05, title="图例", c("实测值","协同模型拟合值","一次预报值"),lty=c(1, 1), col=c("red", "blue","gray"));axis(1,at=seq(1,length(SO2_forcast)),label=A_forcast[ind6,]$TIME) ```