diff --git a/README.md b/README.md index e69de29..1a0e11e 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,52 @@ +# AQI Prediction and Analysis Project + + +## 📋 目录结构 + +``` +. +├── hw1/ +│ └── data/ +│ └── AQIDataset.csv # 原始数据集 +├── src/ +│ ├── data_processing.py # 数据加载、预处理与交叉验证划分 +│ ├── models.py # 模型实现 (GD, LWLR, Lasso, Ridge) +│ └── experiments.py # 实验主程序 (训练、评估、绘图) +├── report/ +│ ├── images/ # 实验生成的图表 +│ └── 机器学习.md # 详细实验报告 +├── README.md # 项目说明文档 +└── requirements.txt # 依赖列表 +``` + +## 🛠️ 环境要求 + +安装命令: +```bash +pip install pandas numpy matplotlib seaborn scikit-learn +``` + +## 🚀 如何运行 + +在项目根目录下,运行以下命令即可执行所有实验并生成图表: + +```bash +python src/experiments.py +``` + +1. 控制台会输出各步骤的运行日志及最终的模型评估结果。 +2. 生成的分析图表将保存在 `report/images/` 目录下。 +3. 查阅 `report/机器学习.md` 获取详细的分析报告。 + +## 📊 主要结果 + +| 模型 | RMSE | MAE | +| :--- | :--- | :--- | +| Gradient Descent | ~32.84 | ~24.69 | +| Lasso | ~33.16 | ~24.95 | +| Ridge | ~33.11 | ~24.89 | +| **LWLR (k=2.0)** | **~30.36** | **~22.68** | + +* **最佳模型**: 局部加权线性回归 (LWLR) 在 $k=2.0$ 时表现最佳。 +* **主要影响因素**: 纬度、温度和海拔。 + diff --git a/hw1/data/AQIDataset.csv b/hw1/data/AQIDataset.csv new file mode 100644 index 0000000..dc6cf72 --- /dev/null +++ b/hw1/data/AQIDataset.csv @@ -0,0 +1,324 @@ +City,AQI,Precipitation,GDP,Temperature,Longitude,Latitude,Altitude,PopulationDensity,Coastal,GreenCoverageRate,"Incineration(10,000ton)" +Ngawa Prefecture,23,665.1,271.13,8.2,102.22465,31.89941,2617,11,0,36,23 +Aksu City,137,80.4,610,12.27671233,80.26338,41.16754,1108,6547,0,33.94,23 +Alxa League,85,150,322.58,24.2,105.72895,38.85192,1673,1,0,36,23 +Ngari,28,74.2,37.4,1,80.1058,32.50111,4280,1,0,36,23 +Anqin City,79,2127.8,1613.2,17.29178082,117.0344315,30.51264572,13,2271,0,45.8,27.48 +Anyang City,110,672.1,1884.48,15.35068493,114.3500519,36.09685135,71,4735,0,36.28,54.51 +Anshan City,111,708,2326,11.41232877,122.9843826,41.11525726,49,2534,0,38.25,58.13 +Macao SAR,44,2000,3358,22,113.54387,22.19874,40,20547,1,41,22 +Bayannur 
City,53,188,887.43,3.7,107.38765,40.74321,1042,5093,0,35.12,15.98 +Bazhong City,58,1198.9,501.34,16.9,106.75476,31.849014,443,2653,0,39.11,15.09 +Bengbu City,96,910,1253.1,16.01369863,117.3613815,32.93924332,25,2599,0,38.48,29.61 +Baotou City,45,262.9,3781.93,8.91369863,109.8517075,40.6664238,1065,2130,0,37,52.29 +Baoji City,111,695,1788.59,14.83424658,107.1383591,34.38228607,567,5352,0,40.61,36.54 +Baoding City,220,566.9,2757.8,13.25890411,115.5001831,38.85707092,17.2,4565,0,30.96,49.27 +Baoshan City,56,1000,552,17.39178082,99.16872406,25.11680222,1673,4529,0,31,10.95 +Beihai City,23,1670,892.1,24.61643836,109.1191711,21.47979736,21,458,1,27.45,24.74 +Beijing City,296,458.9,22968.6,13.77260274,116.3809433,39.9236145,31.2,1292,0,34,686.67 +Benxi City,63,900,1160,9.271232877,123.7645035,41.28758621,100,602,0,44.88,31.76 +Binzhou City,142,564.8,2355.33,14.11643836,118.0217667,37.36781311,9,1110,0,42.92,27.99 +Haozhou City,59,831,942.6,10.58695652,115.7709,33.879292,39,3693,0,41.32,17.66 +Cangzhiu City,105,581,3133.38,14.30273973,116.8607712,38.30884171,12,3169,1,42.9,24 +Qamdo City,31,477.7,136,7.6,97.18152,31.13818,3230,5714,0,7.6,3.5 +Changde City,69,1550,2709.02,17.93287671,111.6876297,29.03820992,38,2366,0,36.26,26.81 +Changshu City,51,1615.3,2100,17.27534247,120.75248,31.65437,7,1200,0,42.37,28.61 +Changzhou City,116,1450.9,5273.15,16.9739726,119.9502869,31.78393364,3,2399,0,42.47,76.48 +Chaohu City,22,1215,273.1,16.82876712,117.89035,31.62452,13,7708,0,42.93,13.5 +Chaoyang City,224,450,780,13.72739726,120.4514694,41.57785797,168,1022,0,40.57,33 +Chaozhou City,106,1685.8,910.1,23.19589041,116.63666,23.667706,18,3503,0,32.16,51.94 +Chenzhou City,49,1452.1,2012.07,19.67808219,113.0286484,25.80229187,189,1081,0,39.56,21.78 +Chengdu City,118,873.3,10912.17,18.22191781,104.0817566,30.66105652,485,6116,0,39.84,273.55 +Chengde City,195,440.5,1342.55,10.08219178,117.9223404,40.96760178,325,89,0,33,30 +Chizhou 
City,40,1959.4,530.6,17.30547945,117.48,30.67,18,1199,0,36.51,14.06 +Chifeng City,91,374,1861.27,8.434246575,118.9498215,42.26798248,587,1754,0,42.2,42.03 +Chuzhou City,63,1035,1305.7,15.50273973,118.3011627,32.31653214,31,1456,0,47.17,12.57 +Chuxiong Prefecture,44,900,762.97,14.8,101.52806,25.04553,1777,5574,0,36.15,10.95 +Dazhou City,86,1086.1,1353.67,18.75205479,107.5003433,31.22469711,306,974,0,29.57,23.12 +Dali City,39,1768.9,901.07,16.4,100.26763,25.60648,2007,3515,0,37.11,21.5 +Dalian City,132,601.9,7700,12.40273973,121.6008377,38.91780472,29,2820,1,19.38,122.81 +Daqing City,61,427.5,2983.5,6.167123288,125.0248566,46.59545136,149,4667,0,43.7,43.64 +Datong City,82,56.1,1060,7.468493151,113.2963333,40.0971489,1044,9492,0,36,23 +Dandong City,86,878.5,990.5,10.24520548,124.3814621,40.13518143,5,3237,1,48.39,21.37 +Dehong Prefecture,49,1766.9,292.3,18.4,98.58489,24.43335,927,110,0,36,10 +Deyang City,89,711,1605.1,18.13561644,104.3915482,31.13044548,491,2950,0,40.12,18.25 +Dezhou City,116,547.5,2750,15.0109589,116.2878723,37.45369339,23,1448,0,41.3,30.29 +Diqing Prefecture,23,824,161.1,13,99.70225,27.81875,3294,17,0,36,23 +Dongguan City,26,2137.9,6275.06,23.84931507,113.7487717,23.0485363,6,2417,0,36.98,390.43 +Dongying City,199,684.7,3450,14.82876712,118.4959564,37.46191406,2,617,1,34.61,22.32 +Erdos City,61,348.3,4226.13,6.2,109.8,39.62,1311,2624,0,36.04,22.4 +Ezhou City,113,1364.1,730,18.29863014,114.8811874,30.40276718,17,1740,0,36.18,17.85 +Enshi Prefecture,75,1535,670,16,109.48817,30.2721,490,4942,0,17.12,11.96 +Erenhot City,57,136.6,100.73,3.4,111.977942,43.65316,963,1471,0,35.53,7.2 +Foshan City,23,2055.2,8003.92,23.77260274,113.1145172,23.03487778,5,2469,0,43.73,93.84 +Fuzhou City,37,1775.2,5618.1,21.41232877,119.2978134,26.07859039,10,2322,1,41.09,105.9 +Fushun City,67,760,1236,7.48630137,123.9295578,41.84786606,83,2069,0,33.96,38.22 +Fuzhou City(Jiangxi),42,1700,1105.14,19.54931507,116.35,28,45,6648,0,39.78,22.34 +Fuxin 
city,141,480,530,9.308219178,121.6488037,42.00795364,146,1741,0,34.24,29.2 +Fuyang City(Anhui),44,910,1267.4,17.80958904,115.8097305,32.90220642,33,2290,0,29.26,25.23 +Fuyang City(Zhejiang),80,2137.7,643.6,16.0890411,119.96007,30.04869,21,360,0,36,29 +Garze Prefecture,17,920,217.57,13.5,101.96231,30.04952,2552,8,0,36,23 +Ganzhou City,32,1469,1973.87,20.96712329,114.9336777,25.85288239,109,8047,0,37.14,37.63 +Kaohsiung City,68,1738,4095.28,24.2,120.301435,22.62727,9,941,1,44,136.24 +Guang'an City,64,1071.2,1005.6,18.29589041,106.63175,30.474428,302,2106,0,39.49,14.34 +Guangyuan City,46,892.8,605.43,17.27671233,105.8317032,32.44396973,488,2064,0,37.15,18.66 +Guangzhou City,20,2478.1,18100.41,23.60547945,113.2614288,23.11891174,18,5940,1,18.78,455.84 +Guiyang City,36,1433.2,2891.16,19.07945205,106.7113724,26.57687378,1277,2163,0,38.57,111.88 +Guilin City,61,1901,1942.97,20.24520548,110.2866821,25.28188324,154,1535,0,43.63,38.42 +Harbin City,102,423.1,5751.2,5.836986301,126.6433411,45.74149323,118,11366,0,43.7,143.26 +Haikou City,22,1674.3,1161.28,26.09041096,110.3465118,20.03179359,15,2916,1,43.36,78.19 +Haimen City,45,581,915.2,16.53972603,121.18161,31.87117,5,4900,1,41.23,9.06 +Handan City,113,458,3080,15.28356164,114.4729538,36.60151672,55,780,0,32,49 +Hangzhou City,103,2137.7,10053.58,17.90547945,120.1592484,30.26599503,19,3527,0,42.62,351.98 +Hefei City,99,1224.9,5660.3,17.125,117.2757034,31.86325455,38,3357,0,36.66,122.05 +Heyuan City,16,1700,810.08,22.77260274,114.6938171,23.73484039,34,8950,0,30.7,21.34 +Heze City,88,831,2400.96,15.62054795,115.4457626,35.24853897,54,1762,0,44.68,40.15 +Hebi City,85,534,713.23,14.30684932,114.1546707,35.94008255,89,3602,0,38.69,15.96 +Hengshui City,142,547,1139,14.65479452,115.7081909,37.72782135,24,503,0,30,27 +Hengyang City,66,142,2601.57,19.37945205,112.5993576,26.90055466,82,8360,0,45.11,43.82 +Honghe Prefecture,75,1893,1222,18.1,103.374575,23.36225,1304,137,0,36,23 +Hohhot 
City,49,361.2,3090.52,6,111.6632996,40.82094193,1072,177,0,36,23 +Hulunbeier City,42,250,1595.96,0,119.77,49.22,612,10,0,33,22 +Huludao City,181,878,720,10.55342466,120.83,40.72,17,883,1,29.84,25.25 +Huzhou City,74,1450,2084.3,17.14520548,120.0971298,30.86603928,3,1408,0,42.5,45.77 +Hualien County,24,1268,417,23,121.6015714,23.9871589,16,72,1,46,24 +Huai'an City,101,878,2745.09,13.6,119.14111,33.502789,13,6461,0,40.14,43.26 +Huaibei City,97,910,760.4,16.19726027,116.7874985,33.9704895,32,3883,0,43.94,21.75 +Huainan City,73,910,770.6,17.13150685,117.0207291,32.6166954,52,2213,0,38.88,32.1 +Huanggang City,106,1550,1589.24,17.80273973,114.87,30.45,49,5996,0,40.16,18.76 +Huangshan mountain Scenic Spot,88,440,544.7,8.3,118.1299,30.13242,710,90,0,33,23 +Huangshan City,29,1469,544.7,17.73150685,118.3090668,29.72084427,122,833,0,33.91,12.61 +Huangshi City,105,2137,1235,18.05616438,115.0749893,30.21379852,21,3698,0,39.65,29.2 +Huizhou City,18,1670,3140.03,23.98219178,114.3924026,23.08795738,21,1372,1,40.03,95.29 +Keelung City,13,1775,514,21,121.7391833,25.1276033,10,2802,1,41,24 +Ji'an City,40,1035,1328.52,18.32191781,114.9704285,27.1062088,53,1898,0,31.9,16.65 +Jilin City,71,308,2455.2,7.176712329,126.5668182,43.88667679,202,2559,0,19.45,33.83 +Jimo City,61,1670,1110,14.21506849,120.44716,36.389401,22,913,1,44.57,25.02 +Jinan City,107,713.7,6130.68,15.58356164,117.0056,36.6670723,148,2504,0,50.82,137.28 +Jining City,83,831,4013.12,13.85616438,116.576561,35.40924072,39,1583,0,44.92,49.05 +Jiyuan City,140,480,494.41,14.6,112.601918,35.067243,146,6108,0,40.4,18.96 +Jiaxing City,97,574,3517.06,17.26986301,120.7536316,30.77111435,4,3742,1,37.83,33.39 +Chiayi County,51,1775,578,22.6,120.2554615,23.4518428,2,273,1,46,24 +Jiayuguan City,75,267,195,7.2,98.27471161,39.80265427,1656,1826,0,39.5,7.3 +Jiangmen City,46,1966,2240.02,23.90136986,113.0847473,22.59119034,6,2032,0,40.42,45.67 +Jiangyin City,82,581,2880.86,17.05890411,120.284938,31.920658,9,1865,1,42.98,18.15 +Jiaonan 
City,85,878,981.15,10.94246575,120.18,35.97,6,90,1,44,24 +Jiaozhou City,54,1742,981.2,14.39315068,120.006202,36.285877,10,832,1,39.44,18.94 +Jiaozuo City,199,689,1943.37,16.51506849,113.2217865,35.24735641,100,5564,0,35.39,27.88 +Jieyang city,62,1775,1890.01,21.4,116.34977,23.542976,4,2095,0,40.11,33.49 +Jinchang City,108,480,224.5,10.7,102.1657486,38.49519348,1526,3702,0,36.29,9.2 +Jinhua City,59,1638,3406.5,18.80410959,119.6522064,29.11081696,43,1720,0,36.06,45.72 +Kinmen County,35,1775,152.91,21,118.3285644,24.3487792,-12,914,1,46,22 +Jintan City,106,1742,471.48,16.86986301,119.597896,31.723247,7,567,0,32,14 +Jinzhou City,202,547,1357.5,12.18630137,121.1333694,41.11112595,23,309,1,44,25 +Jincheng City,102,592,1040.2,13.04383562,112.84272,35.50651169,703,244,0,33,17 +Jinzhong City,110,440,1046.12,11.93424658,112.75,37.68,811,203,0,33,23 +Jingmen City,110,534,1388.46,16.70273973,112.2002106,31.03021622,87,1995,0,32.37,14.86 +Jingzhou City,100,1550,1590.5,17.50547945,112.23,30.33,34,9389,0,37.3,23.51 +Jingdezhen City,48,1700,772.06,19.07123288,117.1179428,29.19516754,35,2522,0,41.11,14.78 +Jiujiang City,47,1550,1902.68,17.92191781,115.984581,29.72321129,41,6055,0,41.22,21.57 +Jurong City,65,1200,468.5,16.54109589,119.168695,31.944998,24,1804,0,40.38,7.37 +Kaifeng City,65,1450,1604.84,15.99863014,114.3461685,34.7851944,76,5601,0,38.69,31.62 +Karamay City,74,267,670.1,8.6,84.86360931,45.59651184,354,5185,0,43.05,16.57 +Kunming City,62,1189.5,3970,16.72054795,102.704567,25.04384422,1930,2170,0,40.64,158.76 +Kunshan City,55,1615,3080,17.51369863,120.980736,31.385597,7,2893,0,43.75,69.25 +Lhasa City,33,377.4,389.46,10.06027397,91.17211,29.652491,3660,1422,0,36.14,23.76 +Laiwu City,138,450,755,14.53013699,117.66173,36.205116,197,1003,0,44.94,22.12 +Laixi City,118,672,526.02,13.60136986,120.51769,36.889084,70,729,0,41.98,15.51 +Laizhou City,106,708,717,14.73150685,119.942275,37.177129,48,1018,1,42.76,22.84 +Lanzhou 
City,75,267,2095.99,12.33972603,103.7500534,36.06803894,1525,7540,0,25.8,98.36 +Langfang City,181,1742,2056,13.83287671,116.6898575,39.51511002,18,721,0,36,27 +Leshan City,87,1066.8,1301.2,19.2890411,103.7514038,29.56822395,355,1981,0,33.12,18.99 +Lijiang City,26,665,290,14.42739726,100.2342529,26.87666512,2384,6538,0,40.63,13.79 +Lishui City,32,1229,1102.34,19.87260274,119.9165573,28.44883728,74,1296,0,40.97,16.1 +Liyang City,99,1615,738.15,16.99178082,119.48421,31.416911,7,1896,0,43.08,7.18 +Lianyungang City,65,878,2160.64,14.17671233,119.1668015,34.60517883,4,1463,1,40.92,36.25 +Liangshan Prefecture,31,1433,1314.72,15.3,102.267712,27.88157,1523,75,0,36,23 +Liaoyang City,62,601,1025,10.46164384,123.1617432,41.26513672,27,391,0,33,31 +Liaocheng City,120,458,2663.62,14.74931507,115.9884262,36.44943237,37,2088,0,38.81,40.74 +Nyingchi City,35,617,105,3,94.36149,29.649128,2994,1905,0,36.07,1.53 +Lin'an City,86,1224,466.28,17.04520548,119.724733,30.233873,39,1114,0,36.79,12.77 +Lincang City,48,1000,502.1,19.20410959,100.0878067,23.8799305,1493,5543,0,39,6.36 +Linfen City,76,1535,1161.1,15.19589041,111.5141678,36.08282471,450,215,0,33,14 +Linyi City,157,672,3763.2,15.19452055,118.3379593,35.06945038,68,1633,1,44.93,86.2 +Liuzhou City,41,1229,2298.62,21.88493151,109.4028091,24.31040573,91,3517,0,42.95,49.83 +Lu'an City,70,308,1143.4,13.77260274,116.4927902,31.75352287,75,3600,0,44.5,19.28 +Longyan City,19,1469,1738.45,21.83013699,117.0303879,25.10970306,329,2089,0,43.21,23.84 +Loudi City,76,142,1291.38,18.14931507,111.9938965,27.74133492,120,7814,0,37.51,14.5 +Luzhou City,106,1290.6,1353.4,19.07123288,105.4378433,28.88199425,262,2926,0,39.89,28.64 +Luoyang City,110,602.3,3508.75,15.5109589,112.4247971,34.66804123,147,7278,0,32.41,71.39 +Luohe City,85,831,992.85,15.70410959,114.0410919,33.57250977,62,5283,0,34.39,22 +Lvliang City,64,1071,955.8,11.66575342,111.13,37.52,951,181,0,33,23 +Ma’anshan City,89,2137,1365.3,17.05068493,118.4807129,31.72492409,19,4105,0,40.05,22.91 
+Maoming City,26,1638,2445.6,24.59726027,110.8888474,21.67071724,30,4263,1,24.75,22.12 +Meishan City,110,990.2,1029.86,18.92191781,103.83146,30.050497,415,1711,0,35.09,26.17 +Meizhou City,12,1229,955.09,22.7739726,116.1079407,24.31450081,87,1191,0,38.93,21.57 +Mianyang City,88,717.6,1700.33,18.31232877,104.7485504,31.45634842,473,2673,0,38.7,34.24 +Miaoli County,25,1700,623,22,120.8214265,24.560159,60,310,1,46,22 +Mudanjiang City,48,361,1186.3,5.952054795,129.5984955,44.58392334,234,68,0,33,29 +Nagqu Prefecture,47,262,95.9,-2.5,92.051239,31.476202,4505,1,0,36,23 +Nanchang City,61,2207.3,4000.01,19.2109589,115.8999176,28.67599106,16,7536,0,44.09,63.69 +Nanchong City,91,1093.8,1516.2,19.20410959,106.0816269,30.79582214,338,2619,0,43.1,36.95 +Nanjing City,97,1768.9,9720.77,16.77260274,118.7727814,32.04761505,13,1462,0,38.5,238.68 +Nanning City,28,1229.4,3410.09,23.14109589,108.3117676,22.80654335,79,3411,0,36.76,107.44 +Nanping City,12,1229,1339.51,21.07671233,118.1691208,26.64484215,87,1769,0,42.73,12.37 +Nantong City,55,684,6148.4,16.48767123,120.8555679,32.01506805,2,4082,1,42.97,59.88 +Nantou Ccounty,64,1893,590,19,120.9718638,23.9609981,446,124,1,44,22 +Nanyang City,111,602,2875.02,16.27671233,112.5375137,32.99901962,130,2506,0,36.08,49.25 +Neijiang City,89,1128,1198.58,18.93561644,105.0534363,29.57756805,335,2307,0,35.43,16.43 +Ningbo City,51,1651,8011.5,18.1630137,121.5412827,29.87066841,10,2444,1,39.16,111 +Nujiang Prefecture,19,617,113.45,11,98.865859,25.777128,791,36,0,36,23 +Panzhihua City,41,1053.4,925.18,21.85068493,101.6984177,26.55479813,1156,2102,0,39.8,17.91 +Panjin City,119,878,1267.9,10.42465753,122.0476303,41.18847656,5,352,1,29,24 +Penglai City,78,581,471.3,13.72191781,120.758848,37.810661,18,1115,1,42.08,7.3 +Penghu County,24,1670,136,23.4,119.5793157,23.5711899,21,807,1,41,24 +Pingdingshan City,103,689,1705.78,16.09589041,113.3001938,33.74362946,124,3620,0,36.68,30.3 +Pingdu City,69,708,779.3,13.92054795,119.98842,36.776357,52,660,1,44.05,36.67 
+Pingtung County,51,1966,946,25,120.5487597,22.5519759,17,302,1,46,24 +Pingxiang city,55,900,912.39,14.45068493,113.841423,27.63298988,96,5258,0,40.04,15.84 +Putian City,31,1775,1655.16,21.90273973,119.0103226,25.43813705,15,2570,1,41.87,42.57 +Puyang City,77,458,1333.64,14.71643836,115.0149536,35.70189667,55,3384,0,38.4,21.9 +Pu'er City,32,477,120.03,20.63561644,100.9752121,22.79548073,1330,58,0,36,7.95 +Qiqihar City,52,136,1270,3.2,123.9592667,47.34136963,149,125,0,33,31 +Qianjiang Country,90,1200,557,16.1,112.899762,30.402167,32,1333,0,36.61,12.78 +Qinhuangdao City,163,878,1250.44,11.19041096,119.5982971,39.92430878,4,412,1,29,25 +Qingdao City,65,720,9300.07,14.21369863,120.3581696,36.13386154,6,1723,1,42.9,184.97 +Qingyang County,23,617,79.95,16.1,117.80361,30.47926,772,225,0,36,14 +Qingyuan City,22,878,1285,20.7,113.0212631,23.71959686,14,1940,0,35,30.65 +Quzhou City,45,1035,1146.2,17.3,118.8691788,28.9584446,61,1790,0,41.11,27.23 +Qujing City,41,1768,1630.3,16.22876712,103.7947006,25.49616623,1847,8719,0,36.18,19.05 +Quanzhou City,18,1775,6137.74,22.26438356,118.5896378,24.91591835,3,2410,1,44.42,51.4 +Shigatse City,25,377,168,6.3,88.880583,29.266869,3844,2700,0,76.49,4.13 +Rizhao City,129,831,1670.8,14.45205479,119.4515533,35.42756271,37,1674,1,45.74,25.54 +Rongcheng City,35,1742,1022.3,13.30684932,122.486657,37.16516,17,771,1,46,15.24 +Rushan City,47,601,477.18,13.16986301,121.539764,36.919816,27,1345,1,46.49,6.43 +Sanmenxia City,180,1093,1260.55,15.29178082,111.1952591,34.78076935,377,7134,0,36.3,15.86 +Sanming City,13,1469,1713.05,21.2260274,117.6012268,26.22301292,204,1011,0,43.13,11.45 +Sanya City,21,1268,435.02,27.44794521,109.5078201,18.23404312,7,1946,1,42.9,45.77 +Xiamen City,54,1775.2,3466.01,22.39315068,118.0875168,24.45743561,2,9685,1,41.36,161.56 +Lhoka Prefecture,35,617,115,7.4,91.773134,29.237137,3572,4,0,36,23 +Shantou City,50,1738,1850.01,23.70547945,116.6837997,23.36269188,8,4168,1,58.11,78.76 +Shanwei 
City,59,1775,780,22,115.3640137,22.77868652,8,825,1,35.42,9.13 +Shangqiu City,65,831,1803.93,15.05479452,115.6471863,34.44358444,50,9364,0,36.06,28.17 +Shanghai Municipality,46,1651.5,24964.99,17.91643836,121.4692688,31.23817635,16,3804,1,34,613.85 +Shangrao City,30,1229,1650.81,18.99726027,117.9634018,28.45326614,79,7649,0,46.46,27.76 +Shaoguan City,12,1700,1149.98,21.3109589,113.6053925,24.80877686,57,454,0,41.6,21.76 +Shaoyang city,77,142,1387,17.90410959,111.4773788,27.25023651,222,9112,0,35.1,17.42 +Shaoxing City,104,2127,3366.7,18.11506849,120.5739288,30.01093102,12,2935,0,48.33,82.02 +Shenzhen City,37,1966,17502.99,24.4109589,114.110672,22.55639648,1,5697,1,34.56,574.83 +Shennongjia Forestry District,50,900,22.5,12,110.680447,31.743483,887,24,0,36,23 +Shenyang City,90,573.6,7280,9.183561644,123.4116821,41.7966156,51,640,0,36,31 +Shiyan City,69,760,1300,16.49452055,110.7827988,32.65213013,290,1376,0,29.72,38.7 +Shijiazhuang City,182,534.5,5440.6,14.8739726,114.4897766,38.04512787,80.5,681,0,36,31 +Shizuishan City,85,480,482.4,8.4,106.3820572,39.02428055,1102,4096,0,40.64,21.77 +Shouguang City,127,547,806.9,14.84657534,118.790739,36.85576,23,1043,1,44,12.6 +Shuozhou City,96,440,910,8.663013699,112.4232712,39.31313324,1087,165,0,33,23 +Suzhou City(Jiangsu),73,2055,14504.07,17.46986301,120.6187286,31.31645203,6,1251,0,34,214.33 +Suizhou City,106,672,785,16.28356164,113.36982,31.715105,71,1843,0,38.44,21.29 +Suining City,95,1127.1,915.81,18.46849315,105.5697098,30.50339317,280,2256,0,33.64,60.91 +Taipei City,63,1775,5412.65,22.4,121.5654177,25.0329694,11,9952,1,41,351.93 +Taitung County,15,2383,254,22,121.0713702,22.7972447,143,63,1,44,22 +Tainan City,55,1638,2378.27,23.8,120.2270277,22.9997281,28,860,1,45,127.33 +Taichung City,30,2383,3774.95,22.5,120.6736482,24.1477358,90,1240,1,44,106 +Taizhou City,55,581,3558.13,16.8,121.43,28.68,14,1378,1,38.55,77.81 +Taicang City,47,1742,625.64,17,121.13055,31.457735,4,581,0,33,14 +Taiyuan 
City,138,401.5,2735.34,11.8890411,112.5693512,37.87111282,788,618,0,36,23 +Tai'an City,151,537,3240,9.952054795,117.1241074,36.1871109,167,1658,0,36.64,31.71 +Taizhou City,70,1742,3655.53,18.89863014,119.91124,32.495872,8,801,0,36,29 +Tangshan City,166,547,6225.3,12.55890411,118.2017288,39.62533951,25.9,591,1,34,24 +Taoyuan City,21,2383,2990.09,22.6,121.3009798,24.9936281,111,1727,1,44,93.75 +Tianjin Municipality,121,574.4,16538.19,14.89863014,117.2034988,39.13111877,3.3,1298,1,34,421.66 +Tianmen City,62,1200,440,16.1,113.165862,30.653061,27,928,0,37.87,16.43 +Tieling City,79,708,745,9.460273973,123.844429,42.29558182,57,205,0,33,31 +Tongliao City,82,440,1877.27,8.561643836,122.2603302,43.61156082,177,52,0,36,31 +Tongchuan city,98,480,324.54,12.88082192,109.0572815,35.07545853,717,7336,0,43.49,13.63 +Tongling City,58,1584,721.3,17.45616438,117.813179,30.92524719,26,2632,0,44.86,10.74 +Wafangdian City,122,592,935.9,11.26438356,121.979603,39.627114,120,270,1,44,33 +Weihai City,49,1670,3001.57,13.90821918,122.1116867,37.50076294,22,1473,1,41.1,41.35 +Weifang City,146,288.3,5100,14.74931507,119.1068497,36.7040863,31,1082,0,42.04,47.43 +Weinan City,117,537,1469.08,15.60410959,109.5008392,34.50152588,355,2022,0,38.69,20.39 +Wenzhou City,50,1742,4619.84,19.61506849,120.6502914,28.01647568,9,774,1,41,24 +Wendeng City,41,708,829.7,13.39726027,122.057988,37.193882,45,360,1,46,25 +Wenshan Prefecture,60,1000,190.1,17.5,104.23251,23.386305,1259,5216,0,23.68,10.33 +Wuhai City,113,480,258.92,10.5,106.8148727,39.67420197,1092,317,0,36,23 +Urumqi City,170,408.9,2680,9.1,87.60611725,43.79093933,836,2123,0,40.3,138.55 +Wuxi City,69,2055,8518.26,18.25616438,120.2991333,31.57723045,5,1360,0,36,198.34 +Wuhu City,56,1742,2457.3,17.33104396,118.3598328,31.33449554,8,610,0,36,27 +Wujiang City,55,1742,1540,17.54246575,120.645158,31.138677,7,652,0,36,28 +Wuhan City,128,1427.5,10905.06,17.15068493,114.2919388,30.56751442,15,4413,0,41.99,330.68 +Xi’an 
city,113,558,5810.03,15.48493151,108.949028,34.26168442,385,7820,0,42.62,332.34 +Xining City,72,308.2,1131.62,7.283561644,101.7874527,36.60944748,2263,3388,0,38.92,55.8 +Sipsongpanna Prefecture,69,1893.7,335.9,20,100.79715,22.00881,555,59,0,36,23 +Xilin Gol League,18,250,1002.6,1.5,116.0482221,43.933454,987,5,0,33,22 +Xiantao city,78,1200,590,16.3,113.423482,30.360882,30,1667,0,22.79,18.89 +Xianning City,85,1584.2,1035,17.62739726,114.2687378,29.89432716,28,2493,0,21.57,15.26 +Xianyang City,127,537,2155.91,14.07260274,108.7101288,34.33721542,383,2007,0,40.27,30.96 +HongKong SAR,37,2383,17900,22.8,114.109497,22.396428,321,25900,1,37,594.48 +Shangri-la City,24,617.6,100.9,5.5,99.7,27.83,3459,2034,0,8.67,4.64 +Xiangtan City,67,142,1703.1,18.51232877,112.9150238,27.87335014,68,4851,0,36.23,30.42 +Xiangyang City,118,672,3400,15.1,112.122426,32.009016,70,3099,0,41.18,34.33 +Xiaogan City,101,1200,1460,16,113.9113312,30.92845535,24,5553,0,34.45,16.4 +Xinzhou City,65,1450,680,10.50684932,112.7315521,38.39920807,790,125,0,33,23 +New Taipei City,16,1775,5470.33,22,121.4627868,25.0169826,15,1935,1,41,24 +Xinyu City,47,1700,946.8,18.7739726,114.9293823,27.80654716,63,2041,0,46.71,16.1 +Hsinchu City,21,1700,686,22.2,120.9674798,24.8138287,25,4199,1,46,24 +Hsinchu County,22,1700,714,22.2,121.0177246,24.8387226,30,380,1,46,24 +Xinyang City,106,537,1877.75,16.60821918,114.0677185,32.13063049,106,2065,0,39.9,27.11 +Xingtai City,175,534,1668.1,14.86712329,114.4950867,37.0655899,76.8,586,0,36,27 +Hinggan League,36,1469,502.31,5,122.037657,46.082462,274,27,0,33,29 +Suqian City,82,831,2126.19,15.65890411,118.29706,33.958302,30,582,0,36,27 +Suzhou City(Anhui),87,910,1235.8,16.21917808,116.9701538,33.6401329,28,566,0,33,29 +Xuzhou City,116,831.6,5319.88,15.50547945,117.1856079,34.26752853,36,770,0,36,31 +Xuchang City,82,672,2170.6,15.28767123,113.8215866,34.02685928,71,5196,0,38.05,23.33 +Xuancheng City,54,1742,971.5,17.07808219,118.75,30.95,15,210,0,33,14 +Yaan 
City,81,1388.6,517.72,17.98356164,102.9826965,29.98229408,580,1346,0,40.8,15.19 +Yantai City,89,574,6446.08,13.8,121.3799362,37.53561401,4,1962,1,44.38,119.91 +Yanan City,108,510.4,1198.63,10.89041096,109.471283,36.59387207,1070,6300,0,40.78,13.21 +Yancheng City,88,684,4212.5,15.73561644,120.1351776,33.38982773,1,496,1,44,24 +Yangzhou City,97,1742,4016.84,16.32876712,119.4368362,32.39188766,12,676,0,36,31 +Yangjiang City,24,1966,1256,23.78082192,111.9578934,21.84523392,4,1180,1,44.71,22.82 +Yangquan City,102,480,598.85,11.8260274,113.5742569,37.86065674,672,314,0,33,23 +Yibin City,69,866.3,1529.9,19.68767123,104.6168671,28.77025604,321,707,0,38.22,29.21 +Yichang City,104,1651.5,3384.8,16.84931507,111.2852707,30.70395279,59,1709,0,39.91,31.43 +Yichun City,36,1469,1621.02,17.2,114.3746109,27.79557419,96,6517,0,50.56,16.75 +Yilan County,21,1268,565,25,121.7377502,24.7021073,13,214,1,41,24 +Yixing City,57,1200,1320,16.54520548,119.823308,31.340637,34,530,0,33,14 +Yiwu City,78,672,1046,17,120.07514,29.306756,73,669,0,33,14 +Yiyang City,66,866,1354.41,15.28630137,112.3340683,28.60197067,81,5632,0,39.47,17.19 +Yinchuan City,81,227,1480.73,11.31643836,106.2719421,38.46800995,1111,799,0,40.89,51.93 +Yingtan City,41,1700,639.26,19.19452055,117.0302811,28.2455864,40,325,0,36,14 +Yingkou City,88,878,1500,10.53424658,122.2241516,40.66835022,5,492,1,44,24 +Yongzhou City,59,1901,1418.18,19.13835616,111.6121979,26.2112999,103,5372,0,43.34,18.85 +Yuxi City,46,892,1245.7,17.5260274,102.5332336,24.35497284,1625,3397,0,42.21,10.86 +Yueyang City,77,1638.4,2886.28,18.23972603,113.0980682,29.37461853,37,4581,0,35.3,29.28 +Yunfu City,15,1469,710.07,21.5,112.03999,22.933193,150,3118,0,34.84,10.76 +Yunlin County,48,1700,715,22,120.4313373,23.7092033,28,542,1,46,24 +Yuncheng City,119,592.4,1173.54,15.21780822,110.9911499,35.01391602,370,374,0,33,30 +Zaozhuang City,148,534,2030,15.24657534,117.556282,34.87264633,89,2784,0,44.92,46.25 +Zhanjiang 
City,29,1674,2450,24.49452055,110.3992233,21.19499779,17,7718,1,40.5,37.63 +Zhangjiagang City,73,581,2229.82,17.13287671,120.556005,31.875572,7,1152,1,44,24 +Zhangjiajie City,64,1071,447.7,18.45890411,110.47,29.13,164,1632,0,41.66,12.43 +Zhangjiakou City,103,440,1363.54,9.624657534,114.8787766,40.81744003,716,120,0,33,23 +Zhangqiu City,103,401,870.8,15.00273973,117.526228,36.681259,143,737,0,33,29 +Changhua County,38,1775,1356,22.6,120.5161352,24.0517963,14,1200,1,46,24 +Zhangzhou City,59,1670,2767.45,23.28493151,117.6530914,24.51816368,19,388,1,45,24 +Changchun City,106,530.5,5530.03,7.157534247,125.3154297,43.89256287,219,382,0,36,31 +Changsha City,76,1638.4,8510.13,18.50547945,112.9812698,28.20082474,63,2831,0,37.6,203.04 +Changzhi City,91,592,1137.1,11.65342466,113.1055679,36.18191147,931,247,0,33,23 +Zhaoyuan City,117,480,639.84,11.5,120.434071,37.355469,85,1464,1,43.04,6.86 +Shaotong City,27,665,709.2,13.94657534,103.7149277,27.34227943,1932,4949,0,24.05,14.6 +Zhaoqing City,34,2137,1970.01,23.75616438,112.4514084,23.05788231,17,1507,0,39.64,27.76 +Zhenjiang City,82,1224,3502.48,16.74931507,119.4442978,32.20589828,23,827,0,36,31 +Zhengzhou City,152,689.2,7315.19,15.98219178,113.6500473,34.7570343,110,15055,0,41.87,206.96 +Zhongshan City,45,1966,3010.03,24.05616438,113.3714523,22.52685356,13,2809,1,44.71,24.97 +Chongqing Municipality,66,1450.9,15719.72,19.7260274,106.5103378,29.55817604,238,1904,0,40.3,440.03 +Zhoushan City,25,1450,1094.7,17.66575342,122.1016083,30.02004242,3,800,1,46,24 +Zhoukou City,88,910,2082.38,16.81369863,114.6372528,33.62804031,49,3808,0,42.46,14.86 +Zhuhai City,41,1966,2024.98,24.31369863,113.5682602,22.27258873,7,2472,1,45.1,81.5 +Zhuzhou City,52,1638,2335.11,18.75479452,113.1520615,27.85422325,46,1287,0,34.7,38.19 +Zhuji City,104,1742,1026.78,17.8260274,120.246863,29.708692,16,501,0,36,29 +Zhumadian City,79,1388,1807.09,15.94657534,114.0356903,32.97904205,84,2539,0,39.46,18.86 +Ziyang 
City,86,807.8,1270.4,16.18219178,104.65019,30.122671,367,1787,0,38.67,18.28 +Zibo City,116,288,4130.2,14.57671233,118.0560532,36.7935791,38,782,0,36,31 +Zigong City,118,994.8,1143.11,19.43287671,104.7763519,29.36772156,311,1557,0,40.2,35.47 +Zunyi City,60,1000,2168.34,16.99178082,106.9293976,27.69538689,865,3581,0,44.06,34.51 diff --git a/report/images/feature_coefficients.png b/report/images/feature_coefficients.png new file mode 100644 index 0000000..e97c1db Binary files /dev/null and b/report/images/feature_coefficients.png differ diff --git a/report/images/feature_correlation.png b/report/images/feature_correlation.png new file mode 100644 index 0000000..7cff014 Binary files /dev/null and b/report/images/feature_correlation.png differ diff --git a/report/images/gd_initialization.png b/report/images/gd_initialization.png new file mode 100644 index 0000000..550ac63 Binary files /dev/null and b/report/images/gd_initialization.png differ diff --git a/report/images/gd_learning_rates.png b/report/images/gd_learning_rates.png new file mode 100644 index 0000000..c54d677 Binary files /dev/null and b/report/images/gd_learning_rates.png differ diff --git a/report/images/gd_scaling_comparison.png b/report/images/gd_scaling_comparison.png new file mode 100644 index 0000000..4c7058d Binary files /dev/null and b/report/images/gd_scaling_comparison.png differ diff --git a/report/images/lwlr_k_analysis.png b/report/images/lwlr_k_analysis.png new file mode 100644 index 0000000..a9a5e31 Binary files /dev/null and b/report/images/lwlr_k_analysis.png differ diff --git a/report/images/model_comparison.png b/report/images/model_comparison.png new file mode 100644 index 0000000..fe57b80 Binary files /dev/null and b/report/images/model_comparison.png differ diff --git a/report/机器学习.md b/report/机器学习.md new file mode 100644 index 0000000..16dc782 --- /dev/null +++ b/report/机器学习.md @@ -0,0 +1,200 @@ +# 空气质量指数(AQI) 预测与影响因素分析实验报告 + +## 摘要 + +本实验旨在利用全国主要城市的相关环境与经济数据,预测空气质量指数 (AQI) 
并分析其关键影响因素。我们采用了多种回归分析方法,包括基于梯度下降的线性回归、正则化回归(Lasso 与 Ridge)以及局部加权线性回归 (LWLR)。通过对比实验,我们发现局部加权线性回归在 $k=2.0$ 时表现最佳,RMSE 达到 30.36。实验同时揭示了数据预处理(尤其是标准化)对于梯度下降算法收敛的重要性。特征分析表明,纬度、温度和海拔是影响 AQI 的最主要因素,显示出空气质量受地理和气象条件的制约显著大于经济活动指标。 + +--- + +## 1. 引言 + +空气质量指数 (Air Quality Index, AQI) 是衡量空气清洁状况的重要指标,直接关系到公众健康。准确预测 AQI 并理解其背后的驱动因素,对于环境治理和政策制定具有重要意义。本实验基于包含 323 个城市样本的数据集,通过机器学习方法构建预测模型,并深入探讨不同算法的特性及适用场景。 + +--- + +## 2. 数据探索与预处理 + +### 2.1 数据集概况 + +实验数据来源于 `AQIDataset.csv`,包含 323 个有效样本。每个样本代表一个城市,包含以下 10 个特征变量: + +* **地理特征**: Longitude (经度), Latitude (纬度), Altitude (海拔), Coastal (沿海标识) +* **气象特征**: Temperature (温度), Precipitation (降水量) +* **经济与社会特征**: GDP, PopulationDensity (人口密度), GreenCoverageRate (绿化覆盖率), Incineration (焚烧量) + +目标变量为 **AQI**。 + +### 2.2 数据清洗与预处理细节 + +在进行模型训练前,我们对数据进行了严格的清洗和预处理,这是保证模型性能的关键步骤。 + +1. **缺失值处理 (Handling Missing Values)**: + * 原始数据中存在部分缺失值。考虑到样本量尚可(300+),且缺失值的填充(如均值填充)可能引入噪声,我们采取了**直接删除 (Drop)** 的策略,保留了完整性较高的数据子集。 + * 最终有效样本数:323。 + +2. **特征选择 (Feature Selection)**: + * `City` 字段为文本标识符,不包含定量信息,直接剔除,不作为输入特征。 + +3. **特征缩放 (Feature Scaling)**: + * 由于不同特征的量纲差异巨大(例如 GDP 可达数万,而 Temperature 不超过 30、Coastal 仅取 0 或 1),直接使用基于梯度的优化算法会导致损失函数的等高线呈细长的椭圆状,使得梯度下降路径呈“之”字形震荡,收敛极慢甚至发散。 + * 我们实现了两种缩放方法进行对比: + * **归一化 (Min-Max Scaling)**: 将数据映射到 $[0, 1]$ 区间。 + $$ x_{norm} = \frac{x - x_{min}}{x_{max} - x_{min}} $$ + * **标准化 (Z-score Standardization)**: 将数据转换为均值为 0,标准差为 1 的分布。 + $$ x_{std} = \frac{x - \mu}{\sigma} $$ + * **实验发现**: 标准化在处理异常值和保持数据分布特性方面表现更优,后续主要实验均基于标准化数据进行。 + +4. **数据集划分 (Data Splitting)**: + * 采用 **10 折交叉验证 (10-fold Cross Validation)**。将数据集随机分为 10 份,轮流取 9 份作为训练集,1 份作为测试集。 + * 最终结果取 10 次实验的平均值,以减少因数据划分不同带来的随机误差,评估结果更具稳健性。 + +--- + +## 3. 
实验方法论 + +### 3.1 基于梯度下降的线性回归 (Gradient Descent Linear Regression) + +我们手动实现了线性回归模型,通过最小化均方误差 (MSE) 来寻找最优参数 $\theta$ 与偏置 $b$。 + +* **假设函数**: $h_\theta(x) = \theta^T x + b$ +* **损失函数**: $J(\theta, b) = \frac{1}{2m} \sum_{i=1}^m (h_\theta(x^{(i)}) - y^{(i)})^2$ +* **参数更新 (梯度下降)**: + $$ \theta_j := \theta_j - \alpha \frac{1}{m} \sum_{i=1}^m (h_\theta(x^{(i)}) - y^{(i)}) x_j^{(i)} $$ + 其中 $\alpha$ 为学习率;偏置项 $b$ 以同样方式同步更新:$b := b - \alpha \frac{1}{m} \sum_{i=1}^m (h_\theta(x^{(i)}) - y^{(i)})$。 + +### 3.2 正则化回归 (Regularized Regression) + +为了防止过拟合和处理多重共线性,我们引入了正则化项: + +* **Ridge Regression (岭回归)**: 加入 L2 正则项 $\lambda \sum \theta_j^2$。倾向于让参数变小但不为零,适合处理特征间存在相关性的情况。 +* **Lasso Regression**: 加入 L1 正则项 $\lambda \sum |\theta_j|$。倾向于产生稀疏解(即让部分参数变为 0),具有自动特征选择的功能。 + +### 3.3 局部加权线性回归 (Locally Weighted Linear Regression, LWLR) + +传统的线性回归是全局参数模型,假设数据整体呈线性关系。LWLR 是一种非参数方法,针对每个查询点 $x$,赋予其附近的训练样本更大的权重,然后进行加权线性回归。 + +* **权重矩阵 $W$**: 对角矩阵,对角线元素 $w^{(i)}$ 为样本 $x^{(i)}$ 对应的权重。我们使用高斯核函数: + $$ w^{(i)} = \exp\left(-\frac{||x^{(i)} - x||^2}{2k^2}\right) $$ + 其中 $k$ 为带宽 (bandwidth) 参数,控制权重的衰减速度。$k$ 越小,只有非常近的样本才有权重(模型复杂,易过拟合);$k$ 越大,权重分布越均匀(退化为普通线性回归)。 +* **闭式解**: $\theta = (X^T W X)^{-1} X^T W y$ + +--- + +## 4. 实验结果与深度分析 + +本章节通过一系列可视化图表,深入探讨了算法参数、特征选择及模型性能的差异。 + +### 4.1 梯度下降算法的挑战与调优 + +#### 4.1.1 "梯度爆炸"与特征缩放的重要性 + +在初步实验中,我们直接使用原始数据进行梯度下降训练。结果显示 RMSE 为 `Infinity` (无穷大),且发生数值溢出 (`RuntimeWarning: overflow`)。 
+ +![Scaling Comparison](images/gd_scaling_comparison.png) + +**图 4-1: 特征缩放方法对 RMSE 的影响对比** + +* **图表解读**: + * 最左侧柱状图(Raw)代表原始数据,其 RMSE 值因过大而无法在常规坐标轴中显示(数值溢出)。这直观展示了在特征尺度差异巨大的情况下(如 GDP vs 温度),基于梯度的算法会遭遇灾难性的发散问题。 + * 中间(Normalized)和右侧(Standardized)分别代表归一化和标准化。可以看出,**标准化 (Standardized)** 取得了最低的 RMSE (32.84),优于归一化 (36.52)。 +* **深度分析**: + * 原始数据的方差差异导致 Hessian 矩阵条件数极大,损失曲面呈现狭长山谷状。即使步长极小,优化路径也容易在山谷两侧剧烈震荡,导致发散。 + * 标准化相比归一化更优,原因在于标准化不限制数据区间,对异常值(outliers)的鲁棒性更强。对于空气质量这种可能出现极端值的自然数据,Z-score 标准化更能保留数据的分布信息。 + +#### 4.1.2 学习率 ($\alpha$) 的敏感性分析 + +![Learning Rates](images/gd_learning_rates.png) + +**图 4-2: 不同学习率下的 Loss 收敛曲线 (Log Scale)** + +* **图表解读**: + * **$\alpha=0.1$ (蓝色)**: 曲线下降最快,但在极短迭代后趋于平缓。虽然初期收敛快,但在 10-fold CV 中观测到部分折出现震荡现象,稳定性稍逊。 + * **$\alpha=0.01$ (橙色)**: 曲线平滑下降,收敛速度适中且最终 Loss 值与 0.1 相当。这是最理想的学习率,兼顾了速度与稳定性。 + * **$\alpha=0.001$ (绿色) & $0.0001$ (红色)**: 曲线下降斜率较小,虽然最终也能收敛,但需要更多的迭代次数(Epochs)。在有限算力下,过小的学习率会导致欠拟合。 +* **深度分析**: + * 该图采用了对数坐标轴 (Log Scale),清晰展示了不同数量级学习率的收敛行为差异。 + * 实验表明,对于标准化后的数据,0.01 是一个安全且高效的“黄金学习率”。 + +#### 4.1.3 初始化策略的影响 + +![Initialization](images/gd_initialization.png) + +**图 4-3: 全零初始化 vs 随机初始化的 Loss 曲线对比** + +* **图表解读**: + * 图中两条曲线(Zero vs Random)几乎完全重合。 +* **深度分析**: + * 这说明对于凸优化问题(线性回归的 MSE 损失函数是严格凸的),只要学习率适当,初始值的选择对最终收敛结果影响微乎其微。 + * 这与非凸优化(如神经网络)截然不同,证明了线性回归模型的稳健性。 + +### 4.2 局部加权回归 (LWLR) 的带宽选择 + +![LWLR Analysis](images/lwlr_k_analysis.png) + +**图 4-4: LWLR 带宽参数 k 对 RMSE 的影响** + +* **图表解读**: + * 横轴为带宽 $k$ (对数坐标),纵轴为 RMSE。 + * 曲线呈现典型的“U型”趋势: + * **左侧 ($k < 1.0$)**: RMSE 极高。这是因为 $k$ 过小时,权重矩阵过于稀疏,导致局部样本不足,产生过拟合甚至矩阵奇异问题。 + * **中间 ($k=2.0$)**: 曲线达到最低点 (**RMSE $\approx$ 30.36**)。这是偏差-方差权衡 (Bias-Variance Tradeoff) 的最佳点。 + * **右侧 ($k > 2.0$)**: RMSE 缓慢上升。随着 $k$ 增大,局部加权退化为全局线性回归,模型欠拟合(High Bias)。 +* **结果分析**: + * 最佳 $k=2.0$ 表明,空气质量的分布具有一定的平滑性,但不是全局线性的。一个城市 AQI 的预测需要参考周边一定范围内的样本,这个范围(由 $k$ 定义)既不能太小(受个别噪声干扰),也不能太大(失去地域特性)。 + +### 4.3 影响 AQI 的关键因素解析 + +#### 4.3.1 特征相关性热力图 + +![Feature Correlation](images/feature_correlation.png) + +**图 4-5: 特征相关性矩阵热力图** + 
+* **图表解读**: + * 热力图展示了各特征之间的线性相关系数。颜色越红代表正相关,越蓝代表负相关。 + * **AQI 与 Latitude (纬度)**: 呈现较强的正相关(具体相关系数见热力图中的标注),说明北方城市 AQI 普遍高于南方。 + * **AQI 与 Precipitation (降水)**: 呈现负相关,验证了降水的清洁作用。 + * **特征共线性**: 某些特征间相关性较高(如 Longitude 与 Latitude),这解释了为何 Ridge 回归(处理多重共线性)在后续对比中表现稳定。 + +#### 4.3.2 回归系数条形图 + +![Feature Coefficients](images/feature_coefficients.png) + +**图 4-6: Ridge 回归下的特征标准化系数** + +* **图表解读**: + * 该图展示了各特征对 AQI 的贡献度(标准化系数,消除了量纲影响)。 + * **绝对值 Top 3**: Latitude (纬度) > Temperature (温度) > Altitude (海拔)。 + * **正负影响**: + * **Latitude (+)**: 纬度越高,AQI 越高(污染越重)。 + * **Temperature (-)**: 温度越高,AQI 越低(空气越好)。这可能与冬季低温导致的大气层结稳定(逆温)及供暖排放有关。 + * **Precipitation (-)**: 降水有助于降低 AQI。 +* **结果分析**: + * 自然地理因素(纬度、气象、海拔)的影响力显著超过了社会经济因素(GDP、焚烧量)。这意味着,尽管人为排放是污染源,但自然条件的扩散能力决定了最终的空气质量表现。 + +### 4.4 模型综合性能对比 + +![Model Comparison](images/model_comparison.png) + +**图 4-7: 四种模型的 RMSE 和 MAE 对比** + +* **图表解读**: + * 蓝色柱状图代表 RMSE,橙色代表 MAE。柱子越低越好。 + * **LWLR (最右侧)** 明显低于其他三个全局模型。其 RMSE 为 30.36,相比 Gradient Descent (32.84) 降低了约 **7.6%**。 + * Lasso 和 Ridge 与普通 GD 差异不大,说明正则化带来的收益有限。 +* **结果分析**: + * LWLR 的胜利有力证明了中国空气质量分布的 **空间非平稳性 (Spatial Non-stationarity)**。不同区域的 AQI 生成机制存在差异,通过局部加权,LWLR 实际上构建了无数个“局部线性模型”,从而比单一的全局模型更精准地捕捉了数据规律。 + +--- + +## 5. 结论与展望 + +本实验通过构建多种回归模型,成功实现了对城市 AQI 的预测。主要结论如下: + +1. **数据预处理是核心**: 对于涉及距离计算 (LWLR) 和梯度优化 (GD) 的算法,**Z-score 标准化**是不可或缺的步骤。原始数据的尺度差异直接导致了算法的失效。 +2. **局部模型优于全局模型**: LWLR 的优越表现证明了空气质量分布具有很强的地域性特征。不同区域(如沿海 vs 内陆,南方 vs 北方)的 AQI 生成机制可能不同,全局单一线性模型难以通过一套参数拟合所有城市。 +3. 
**自然因素主导**: 尽管经济指标 (GDP, 工业焚烧) 对 AQI 有影响,但**地理位置 (经纬度、海拔)** 和 **气象条件 (温度、降水)** 仍然是决定一个城市空气质量的基础性因素。 + +**未来改进方向**: +* 尝试非线性模型(如决策树、随机森林或神经网络)以进一步降低 RMSE。 +* 引入更多时间维度的特征(如季节性因素),构建时空预测模型。 diff --git a/src/__pycache__/data_processing.cpython-313.pyc b/src/__pycache__/data_processing.cpython-313.pyc new file mode 100644 index 0000000..c07f3a8 Binary files /dev/null and b/src/__pycache__/data_processing.cpython-313.pyc differ diff --git a/src/__pycache__/models.cpython-313.pyc b/src/__pycache__/models.cpython-313.pyc new file mode 100644 index 0000000..279943c Binary files /dev/null and b/src/__pycache__/models.cpython-313.pyc differ diff --git a/src/data_processing.py b/src/data_processing.py new file mode 100644 index 0000000..02868d5 --- /dev/null +++ b/src/data_processing.py @@ -0,0 +1,67 @@ +import pandas as pd +import numpy as np +from sklearn.model_selection import KFold + +def load_and_preprocess_data(filepath): + """ + Loads the AQI dataset, performs preprocessing, and returns features (X) and target (y). + """ + # Load data + df = pd.read_csv(filepath) + + # Drop City column as it is not an input variable + if 'City' in df.columns: + df = df.drop('City', axis=1) + + # Handle missing values: delete rows with missing data + df = df.dropna() + + # Ensure all columns are numeric + # Some columns might have non-numeric characters if not parsed correctly, + # but based on the csv preview, they look mostly numeric. + # "Incineration(10,000ton)" header implies numeric but let's force conversion just in case + for col in df.columns: + df[col] = pd.to_numeric(df[col], errors='coerce') + + # Drop any rows that became NaN after coercion + df = df.dropna() + + # Target variable is AQI + y = df['AQI'].values + X = df.drop('AQI', axis=1).values + feature_names = df.drop('AQI', axis=1).columns.tolist() + + return X, y, feature_names + +def get_kfold_indices(X, n_splits=10, random_state=42): + """ + Returns a generator of (train_index, test_index) for K-Fold Cross Validation. 
def _fresh_estimator(model):
    """Return an unfitted copy of ``model`` that keeps its hyperparameters."""
    import copy
    from sklearn.base import clone

    if isinstance(model, type):
        # A class was passed: build it with its default arguments.
        return model()
    try:
        return clone(model)
    except TypeError:
        # clone() rejects objects without get_params(). A deep copy keeps
        # every configured hyperparameter (e.g. the alpha held by a wrapped
        # estimator); the subsequent fit() call is relied on to retrain.
        return copy.deepcopy(model)


def evaluate_model(model, X, y, n_splits=10):
    """
    Evaluate a model with K-Fold cross-validation.

    Args:
        model: An estimator class, an estimator instance exposing
            ``fit(X, y)`` / ``predict(X)``, or the string ``'lwlr'`` to run
            locally weighted linear regression with ``k=1.0``.
        X: Feature matrix; scale it beforehand for gradient-based models.
        y: Target vector aligned with the rows of ``X``.
        n_splits: Number of cross-validation folds.

    Returns:
        Tuple ``(mean_rmse, mean_mae)`` averaged over the folds.
    """
    rmse_scores = []
    mae_scores = []

    # The string sentinel selects the function-based LWLR predictor.
    is_lwlr = isinstance(model, str) and model == 'lwlr'

    for train_idx, test_idx in get_kfold_indices(X, n_splits):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if is_lwlr:
            y_pred = lwlr_predict_batch(X_test, X_train, y_train, k=1.0)
        else:
            # A fresh copy per fold prevents state from one fold carrying
            # over into the next.
            clf = _fresh_estimator(model)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))

    return np.mean(rmse_scores), np.mean(mae_scores)
Scaling Comparison + print("Comparing scaling methods...") + scaling_results = {} + + # Raw + gd_raw = LinearRegressionGD(learning_rate=1e-7, n_iterations=1000) # Very, very small LR for raw data + try: + rmse_raw, _ = evaluate_model(gd_raw, X, y) + except ValueError: + print("Raw data caused overflow/NaN.") + rmse_raw = float('inf') + + scaling_results['Raw'] = rmse_raw + + # Normalized + X_norm, _, _ = normalize_features(X) + gd_norm = LinearRegressionGD(learning_rate=0.01, n_iterations=1000) + rmse_norm, _ = evaluate_model(gd_norm, X_norm, y) + scaling_results['Normalized'] = rmse_norm + + # Standardized + X_std, _, _ = standardize_features(X) + gd_std = LinearRegressionGD(learning_rate=0.01, n_iterations=1000) + rmse_std, _ = evaluate_model(gd_std, X_std, y) + scaling_results['Standardized'] = rmse_std + + print(f"Scaling Results (RMSE): {scaling_results}") + + # Plot Scaling Comparison + plt.figure(figsize=(8, 5)) + plt.bar(scaling_results.keys(), scaling_results.values()) + plt.title('Gradient Descent RMSE with Different Scaling Methods') + plt.ylabel('RMSE') + plt.savefig(f"{IMAGES_DIR}/gd_scaling_comparison.png") + plt.close() + + # 2. Learning Rate Analysis (using Standardized data) + print("Analyzing Learning Rates...") + lrs = [0.1, 0.01, 0.001, 0.0001] + loss_histories = {} + + for lr in lrs: + gd = LinearRegressionGD(learning_rate=lr, n_iterations=500, initialization='zeros') + gd.fit(X_std, y) # Fit on whole dataset for loss curve analysis + loss_histories[lr] = gd.loss_history + + plt.figure(figsize=(10, 6)) + for lr, history in loss_histories.items(): + plt.plot(history, label=f'lr={lr}') + plt.title('Loss Curve for Different Learning Rates (Standardized Data)') + plt.xlabel('Iteration') + plt.ylabel('MSE Loss') + plt.legend() + plt.yscale('log') # Log scale to see differences better + plt.savefig(f"{IMAGES_DIR}/gd_learning_rates.png") + plt.close() + + # 3. 
def run_lwlr_analysis(X, y, k_values=(0.1, 0.5, 1.0, 2.0, 5.0), n_splits=10):
    """
    Sweep the LWLR bandwidth with K-Fold CV and plot RMSE against k.

    Args:
        X: Raw (unscaled) feature matrix.
        y: Target vector aligned with the rows of ``X``.
        k_values: Bandwidths to evaluate.
        n_splits: Number of cross-validation folds.

    Returns:
        The bandwidth from ``k_values`` with the lowest mean RMSE.

    Raises:
        ValueError: If ``k_values`` is empty.
    """
    print("Running LWLR Analysis...")
    k_values = list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one bandwidth")

    # Build the folds once and reuse them for every k. Scaling statistics
    # come from the training part only, so the held-out rows never influence
    # the distances used to fit the local models.
    folds = []
    for train_idx, test_idx in get_kfold_indices(X, n_splits):
        X_train, mean, std = standardize_features(X[train_idx])
        X_test = (X[test_idx] - mean) / std
        folds.append((X_train, X_test, y[train_idx], y[test_idx]))

    rmse_scores = []
    for k in k_values:
        fold_rmses = [
            np.sqrt(mean_squared_error(
                y_test, lwlr_predict_batch(X_test, X_train, y_train, k=k)))
            for X_train, X_test, y_train, y_test in folds
        ]
        avg_rmse = np.mean(fold_rmses)
        rmse_scores.append(avg_rmse)
        print(f"k={k}, RMSE={avg_rmse:.4f}")

    plt.figure(figsize=(8, 5))
    plt.plot(k_values, rmse_scores, marker='o')
    plt.title('LWLR RMSE vs k value')
    plt.xlabel('k (Bandwidth)')
    plt.ylabel('RMSE')
    plt.xscale('log')
    plt.savefig(f"{IMAGES_DIR}/lwlr_k_analysis.png")
    plt.close()

    return k_values[int(np.argmin(rmse_scores))]
def run_model_comparison(X, y, best_k=1.0):
    """
    Cross-validate the four models on standardized features and chart them.

    Gradient descent, Lasso and Ridge go through ``evaluate_model``; LWLR is
    cross-validated here so that ``best_k`` can be supplied as its bandwidth.
    Returns a DataFrame with columns ``Model``, ``RMSE`` and ``MAE``.
    """
    print("Running Model Comparison...")
    X_std = standardize_features(X)[0]

    def _lwlr_cv_scores():
        # Mean RMSE / MAE of LWLR over the 10 folds at bandwidth best_k.
        rmse_per_fold, mae_per_fold = [], []
        for train_idx, test_idx in get_kfold_indices(X_std, 10):
            y_true = y[test_idx]
            y_hat = lwlr_predict_batch(
                X_std[test_idx], X_std[train_idx], y[train_idx], k=best_k)
            rmse_per_fold.append(np.sqrt(mean_squared_error(y_true, y_hat)))
            mae_per_fold.append(mean_absolute_error(y_true, y_hat))
        return np.mean(rmse_per_fold), np.mean(mae_per_fold)

    candidates = [
        ('Gradient Descent', LinearRegressionGD(learning_rate=0.01, n_iterations=1000)),
        ('Lasso', LassoRegressionWrapper(alpha=0.1)),  # small penalty for lasso
        ('Ridge', RidgeRegressionWrapper(alpha=1.0)),
        ('LWLR', 'lwlr'),  # placeholder; scored by _lwlr_cv_scores
    ]

    names, rmses, maes = [], [], []
    for label, estimator in candidates:
        if label == 'LWLR':
            rmse, mae = _lwlr_cv_scores()
        else:
            rmse, mae = evaluate_model(estimator, X_std, y)

        names.append(label)
        rmses.append(rmse)
        maes.append(mae)
        print(f"{label}: RMSE={rmse:.4f}, MAE={mae:.4f}")

    results_df = pd.DataFrame({'Model': names, 'RMSE': rmses, 'MAE': maes})

    # Grouped bar chart of both error metrics per model.
    results_df.plot(x='Model', y=['RMSE', 'MAE'], kind='bar', figsize=(10, 6))
    plt.title('Model Comparison Metrics')
    plt.ylabel('Error')
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f"{IMAGES_DIR}/model_comparison.png")
    plt.close()

    return results_df
def lwlr_predict(test_point, X_train, y_train, k=1.0):
    """
    Predict one query point with Locally Weighted Linear Regression.

    Each training sample is weighted by a Gaussian kernel of its distance to
    the query, ``w_i = exp(-||x_i - x||^2 / (2 k^2))``, and a weighted
    least-squares line (with intercept) is fitted and evaluated at the query.

    Args:
        test_point: 1-D feature vector of the query point.
        X_train: 2-D training feature matrix, one row per sample.
        y_train: 1-D training targets aligned with ``X_train``.
        k: Kernel bandwidth; must be positive.

    Returns:
        The predicted value. If the weighted normal equations are singular,
        a message is printed and the mean of ``y_train`` is returned.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be positive")

    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float)
    test_point = np.asarray(test_point, dtype=float)
    m = X_train.shape[0]

    # Gaussian kernel on the squared Euclidean distance to the query.
    sq_dist = np.sum((X_train - test_point) ** 2, axis=1)
    w = np.exp(-sq_dist / (2.0 * k ** 2))

    # Prepend a column of ones so theta[0] is the intercept.
    X_aug = np.c_[np.ones((m, 1)), X_train]
    query = np.r_[1.0, test_point]

    # Scaling the rows by w equals W @ X for diagonal W, without building
    # the m x m weight matrix.
    Xw = X_aug * w[:, None]
    xTwx = X_aug.T @ Xw
    xTwy = Xw.T @ y_train

    # Solve the normal equations directly. A determinant test is unreliable
    # here: with small kernel weights the determinant underflows to 0.0 even
    # though the system is perfectly solvable.
    try:
        theta = np.linalg.solve(xTwx, xTwy)
    except np.linalg.LinAlgError:
        print("Singular matrix in LWLR")
        return np.mean(y_train)

    return query @ theta
class LassoRegressionWrapper:
    """Minimal fit/predict adapter around scikit-learn's ``Lasso``."""

    def __init__(self, alpha=1.0):
        # Keep the penalty strength as a plain attribute so that code which
        # rebuilds the wrapper from its instance attributes recovers the
        # configured value instead of the constructor default.
        self.alpha = alpha
        self.model = Lasso(alpha=alpha)

    def fit(self, X, y):
        """Fit the wrapped Lasso model on ``X`` and ``y``."""
        self.model.fit(X, y)

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)


class RidgeRegressionWrapper:
    """Minimal fit/predict adapter around scikit-learn's ``Ridge``."""

    def __init__(self, alpha=1.0):
        # Stored for the same reason as in LassoRegressionWrapper.
        self.alpha = alpha
        self.model = Ridge(alpha=alpha)

    def fit(self, X, y):
        """Fit the wrapped Ridge model on ``X`` and ``y``."""
        self.model.fit(X, y)

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)