diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/.keep" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/.keep" new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/.keep" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/.keep" new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/thelatinlibrary_sample.7z" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/thelatinlibrary_sample.7z" new file mode 100644 index 0000000000000000000000000000000000000000..bf39a7609dd992721496f527eae5bd5c47e8c9fd Binary files /dev/null and "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/thelatinlibrary_sample.7z" differ diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/unigram_freq.7z" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/unigram_freq.7z" new file mode 100644 index 0000000000000000000000000000000000000000..2617b7d419fc1b596c79356a70f4b47df5f1badc Binary files /dev/null and "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/unigram_freq.7z" differ diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/\346\240\207\346\263\250thelatinlibrary\346\227\266\346\211\200\347\224\250\347\232\204\344\273\243\347\240\201.7z" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/\346\240\207\346\263\250thelatinlibrary\346\227\266\346\211\200\347\224\250\347\232\204\344\273\243\347\240\201.7z" new file mode 100644 index 0000000000000000000000000000000000000000..453e0998b9656316914e7828bd3ef314a00d36cb Binary files /dev/null and "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/corpus/\346\240\207\346\263\250thelatinlibrary\346\227\266\346\211\200\347\224\250\347\232\204\344\273\243\347\240\201.7z" differ diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/.keep" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/.keep" new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/a.model" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/a.model" new file mode 100644 index 0000000000000000000000000000000000000000..338b516301cb14a2411d1192209c962b0bad379c Binary files /dev/null and "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/a.model" differ diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/e.model" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/e.model" new file mode 100644 index 0000000000000000000000000000000000000000..8b9b5acb1e22052c4cd8a0a5721ce48d6e0ca901 Binary files /dev/null and "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/e.model" differ diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/i.model" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/i.model" new file mode 100644 index 0000000000000000000000000000000000000000..f9f3143fad3813b615e8c2c7e132cea5e2ffdc10 Binary files /dev/null and "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/i.model" differ diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/o.model" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/o.model" new file mode 100644 index 0000000000000000000000000000000000000000..3272b25e6bb3b842253c8b973820eedb35702c0d Binary files /dev/null and "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/o.model" differ diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/u.model" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/u.model" new file mode 100644 index 0000000000000000000000000000000000000000..e2431a328dcefa7cbfd054730e684e1bd7ea2fd3 Binary files /dev/null and "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/u.model" differ diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/y.model" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/y.model" new file mode 100644 index 0000000000000000000000000000000000000000..1221fd6666422d7f4f7b680613e2ceb31a5f2102 Binary files /dev/null and "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/epoche_5/y.model" differ diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/.keep" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/.keep" new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_a.txt" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_a.txt" new file mode 100644 index 0000000000000000000000000000000000000000..2a4ccf84c583e3c1592cbdd701435bda443e1d6e --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_a.txt" @@ -0,0 +1,102 @@ +threshold accuracy f1_score matthews_corrcoef +0.56 0.9689926115373686 0.9489077070393254 0.9268248862277796 +0.55 0.9689755612389883 0.948946547608192 0.9268058021162002 +0.58 0.9689954532537652 0.9487833673275392 0.926800127084983 +0.5700000000000001 0.9689755612389883 0.9488127416069072 0.9267666569037064 +0.54 0.9689329354930378 0.9489446856006034 0.9267302163581347 +0.53 0.9689073600454675 0.9489708114813925 0.9266984461849606 +0.6 0.9689471440750214 0.9485845760785947 0.9266674663769475 +0.52 0.968868996874112 0.9489784037296298 0.9266412505028432 +0.59 0.9689215686274509 0.9486017215004265 0.9266146153102338 +0.61 0.9689087809036658 0.948461281190087 0.9265711863290823 +0.51 0.9687936913895994 0.9489328754350924 0.9265048644763022 +0.62 0.9688292128445581 0.9482728537407749 0.926379592696319 +0.5 0.9686587098607559 0.9488018642824648 0.9262400899298305 +0.63 0.9687325944870702 0.948054216611509 0.9261495629533444 +0.49 0.9685507246376811 0.948728045995117 0.9260541461147047 +0.64 0.9686246092639954 0.9478144555990395 0.9258947303638154 +0.48 0.9683716965046888 0.9485272694479515 0.9257003919564036 +0.65 0.9685152031827223 0.9475766099746152 0.9256383323034495 +0.47000000000000003 0.9682054560954817 0.9483585844016478 0.9253923076849189 +0.66 0.9683944302358625 0.9473168143772707 0.9253570588100198 +0.46 0.9680093776641091 0.9481396854060953 0.9250201116203598 +0.67 0.9682253481102586 0.946973056156916 0.9249631285893543 +0.68 0.9680974708724069 0.946702778918484 0.9246685002359587 +0.45 0.9677578857630008 0.9478376005112339 0.9245316074554275 +0.6900000000000001 0.9679326513213982 0.9463630033295862 0.9242892086640779 +0.44 0.9674708724069337 0.9474879351156945 0.9239781195472019 +0.7000000000000001 0.9677294685990339 0.9459644648312223 0.9238188324904457 +0.71 0.9675049730036942 0.9455201196794573 0.923301976544928 +0.43 0.9671099744245524 0.94701519868156 0.923256049938324 +0.72 0.967286160841148 0.9450834100569105 0.9228012680583592 +0.42 0.966773231031543 0.9465926401300869 0.9226116792725222 +0.73 0.966979255470304 0.9444991068272786 0.9220914283612865 +0.41000000000000003 0.9662858766695084 0.9459365857659744 0.9216355146811661 +0.74 0.9666837169650468 0.9439300608333014 0.9214117361405969 +0.4 0.9658155726058539 0.9453123756918635 0.9207140977486596 +0.75 0.9663157146916738 0.9432291404747708 0.9205640821915063 +0.76 0.9659903381642512 0.9426061268726861 0.9198174585785153 +0.39 0.9652770673486786 0.9445885522007672 0.9196563809394209 +0.77 0.9656791702188121 0.9420107888021165 0.9191043886005293 +0.38 0.9647200909349247 0.9438362361456684 0.9185661321076494 +0.78 0.9653665814151747 0.9414108976932117 0.9183900489340289 +0.79 0.9650014208581984 0.9407174935379372 0.9175527204768212 +0.37 0.9641361182154021 0.943051754518565 0.917438642254665 +0.8 0.964568059107701 0.9399049062678482 0.9165540141395442 +0.36 0.9634484228473998 0.9421037114390464 0.9160807376873722 +0.81 0.9641403807899972 0.9391004295159501 0.9155703179252901 +0.8200000000000001 0.9637269110542768 0.9383180189474752 0.914622544405609 +0.35000000000000003 0.962674055129298 0.9410291060757755 0.914547475839829 +0.8300000000000001 0.9632935493037795 0.9374960684025375 0.9136308245388923 +0.34 0.9619408922989485 0.9400215856012396 0.9131197739915117 +0.84 0.962766410912191 0.9365070519452514 0.9124177092303949 +0.33 0.9611352657004831 0.9389099696481734 0.9115493727961711 +0.85 0.962128445581131 0.9353130475723583 0.9109472292373217 +0.32 0.9602514919011083 0.9376889100737933 0.9098297910218478 +0.86 0.9614450127877238 0.9340293057213501 0.9093744836154327 +0.31 0.9593606138107417 0.9364620264974809 0.9081095165048242 +0.87 0.9607161125319693 0.9326581612521374 0.9076975675506352 +0.3 0.9584725774367718 0.9352429359253589 0.9064070952994487 +0.88 0.959923273657289 0.9311605521604155 0.9058767620891532 +0.29 0.957544757033248 0.9339726874972378 0.9046396336350618 +0.89 0.9590153452685422 0.9294493157720953 0.9037866125076021 +0.28 0.9566055697641376 0.9326933460272965 0.9028677888088489 +0.9 0.9579539641943734 0.9274432016006041 0.9013440301063514 +0.27 0.9555839727195226 0.9313099883978482 0.9009619177064768 +0.26 0.9544657573174197 0.9297835465586335 0.8988514777397634 +0.91 0.9567220801364024 0.9251130348410384 0.8985049808320252 +0.25 0.9532324524012503 0.928103832787623 0.8965336893836202 +0.92 0.9551705029838022 0.9221760418336766 0.8949213089456868 +0.24 0.9520176186416596 0.9264635431637445 0.8942840317043912 +0.23 0.9507672634271099 0.9247847737465215 0.8919910309588965 +0.93 0.9534441602728048 0.9188783695458417 0.8909455828274212 +0.22 0.9493961352657004 0.9229404446367718 0.8894673049483448 +0.21 0.9478815004262574 0.9208828610807349 0.8866263757693446 +0.9400000000000001 0.9512432509235579 0.9146581975632261 0.8858687492032516 +0.2 0.9462858766695084 0.9187374249797939 0.8836820422264542 +0.19 0.9447087240693379 0.9166313173511581 0.8808015722681731 +0.9500000000000001 0.9484640522875817 0.9092800884413462 0.879463306812143 +0.18 0.9428829212844558 0.914192525171886 0.877456338801144 +0.17 0.9409789712986644 0.911664444401913 0.8739948548060338 +0.96 0.9445893719806763 0.9017028956283271 0.8705288112569171 +0.16 0.9390437624325092 0.9091028512224216 0.8704840971113842 +0.15 0.9369735720375106 0.9063857267673763 0.8667733124942233 +0.14 0.9348579141801648 0.9036238540788591 0.8630032036784835 +0.13 0.9324566638249503 0.9004912950554408 0.8587078263250059 +0.97 0.9388704177323103 0.8903366902953974 0.857336150745417 +0.12 0.9300838306337027 0.8974202357748152 0.8545044261072602 +0.11 0.9277124183006535 0.8943889730762045 0.8503812567867223 +0.1 0.924846547314578 0.8907189742339502 0.845347660847176 +0.09 0.9219963057686843 0.8871034410428625 0.8403983414253619 +0.98 0.9291915316851378 0.8705601225958104 0.8350008011953165 +0.08 0.9188618925831202 0.8831522795587594 0.8349807768515626 +0.07 0.9156550156294402 0.8791401649958059 0.8294745681522 +0.06 0.9121156578573458 0.8747496633490944 0.8234456775600406 +0.05 0.9084967320261438 0.8703045828399268 0.8173429044664589 +0.04 0.9040977550440467 0.8649529007835198 0.8099855508969125 +0.03 0.89923273657289 0.8591045630095837 0.8019393868400095 +0.02 0.8926385336743393 0.8512868555143782 0.7911631444840639 +0.99 0.9054276783177039 0.8189089919738812 0.7799614220728469 +0.01 0.8813270815572606 0.8381864608398137 0.7730782014968388 +0.0 0.3074836601307189 0.4703441725611737 0.0 +1.0 0.6925163398692811 0.0 0.0 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_e.txt" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_e.txt" new file mode 100644 index 0000000000000000000000000000000000000000..9286bc776877d3bb53f2ea791e96c408af3bb119 --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_e.txt" @@ -0,0 +1,102 @@ +threshold accuracy f1_score matthews_corrcoef +0.43 0.9904340345285417 0.9763590555572154 0.9703745997572796 +0.44 0.9904320197927643 0.9763437110834372 0.9703561884255352 +0.42 0.9904229534817657 0.9763428016871337 0.970353661602672 +0.46 0.9904380640000967 0.9763396796434499 0.9703534021853815 +0.45 0.9904330271606531 0.9763376561896574 0.9703494985595893 +0.47000000000000003 0.9904269829533205 0.9763022775049935 0.9703081156221388 +0.48 0.9904279903212093 0.9762953723337908 0.9703012042627404 +0.5 0.9904279903212093 0.9762750121096813 0.9702802890745934 +0.41000000000000003 0.9903917250772147 0.9762759924385633 0.9702696662426207 +0.49 0.9904169092744332 0.9762588689209053 0.9702573731739277 +0.51 0.9904118724349895 0.9762252085727132 0.9702205274394506 +0.52 0.9904148945386557 0.9762227836460731 0.9702203993902299 +0.4 0.990367548247885 0.9762278861166083 0.9702094192120846 +0.53 0.9904078429634345 0.9761955951101223 0.9701894478289801 +0.54 0.9904017987561021 0.9761692761742784 0.9701604278384114 +0.39 0.9903383345791116 0.9761669876374479 0.9701334502478811 +0.38 0.990325238796558 0.9761449386236394 0.9701064537782847 +0.55 0.9903695629836625 0.9760790691855373 0.9700513047351241 +0.37 0.9902990472314508 0.9760903353311848 0.9700388588369905 +0.36 0.9902577451480126 0.9759988484609332 0.9699252731458206 +0.56 0.9903232240607805 0.9759547029256863 0.9698993197995965 +0.35000000000000003 0.9902184578003518 0.9759138355294491 0.9698202862597388 +0.5700000000000001 0.9902940103920072 0.9758717836348838 0.9697999280485199 +0.58 0.9902476714691252 0.9757440763281127 0.9696457822874511 +0.34 0.9901499567839176 0.9757563435666786 0.9696245492917056 +0.59 0.9902043546499094 0.9756249185324817 0.9695022036849276 +0.33 0.9900703747207072 0.975572159647101 0.9693956702341503 +0.6 0.99014794204814 0.9754726161038075 0.9693177525702532 +0.32 0.9900179915904929 0.9754555790318467 0.9692520674746136 +0.61 0.9900844778711496 0.975301052145569 0.9691104834719486 +0.31 0.9899333726878389 0.9752608711797691 0.9690109735300975 +0.62 0.9900230284299365 0.9751364432862873 0.9689111822262821 +0.3 0.9898578200961835 0.9750869778237481 0.968795896060579 +0.63 0.9899263211126177 0.9748824498653699 0.9686012722461993 +0.29 0.9897691717219745 0.9748830215556896 0.9685439512108137 +0.28 0.9896875749229868 0.9746970292428884 0.9683150952817311 +0.64 0.989817525380634 0.9745957384778555 0.9682524308385831 +0.27 0.9895727349836706 0.97443168485094 0.9679876965103575 +0.65 0.9897147738559826 0.9743238961287175 0.9679227752900554 +0.66 0.9896351917927724 0.9741101870840305 0.967666049372828 +0.26 0.9894578950443544 0.9741674462922324 0.9676624973660454 +0.67 0.9895374771075647 0.9738507787361965 0.9673528932703089 +0.25 0.9893339887940396 0.9738791057516296 0.9673065756842482 +0.24 0.9892513846271631 0.9736927750054242 0.9670795271471562 +0.68 0.9894327108471359 0.9735733647732197 0.9670179112080943 +0.6900000000000001 0.9893279445867071 0.9732936039769691 0.9666822673844535 +0.23 0.989101286811741 0.9733419080777541 0.9666462722987236 +0.7000000000000001 0.9892020236006149 0.9729605646522257 0.9662807962785306 +0.22 0.9889491742605416 0.9729885453703796 0.9662113171496817 +0.71 0.9890811394539664 0.9726410165153453 0.965895858732511 +0.21 0.9887618038332363 0.9725525894944027 0.9656746778876046 +0.72 0.9889471595247641 0.972284950718136 0.9654689756481499 +0.2 0.9885573081518224 0.9720749224986295 0.9650859246417325 +0.73 0.9887869880304547 0.9718623527835568 0.9649602680011229 +0.19 0.9883286356410789 0.9715422592292388 0.9644305298471488 +0.74 0.9886086839141481 0.9713918516869395 0.9643944818807102 +0.18 0.9881060073376677 0.971026548976592 0.9637981082444615 +0.75 0.9884062029685117 0.9708592335641671 0.9637529840071662 +0.17 0.9878954674489214 0.9705412216959393 0.9632048724695788 +0.76 0.9881956630797654 0.9703029012833768 0.9630858644120838 +0.16 0.9876244854868508 0.9699105767323156 0.9624301183024556 +0.77 0.987935762164471 0.9696191742179017 0.9622636030379668 +0.78 0.9876939938711737 0.96897838949694 0.9614984007489885 +0.15 0.9872970909230109 0.9691504508780256 0.9614976994888643 +0.79 0.9874471887384328 0.9683247796766133 0.960717895478266 +0.14 0.9869122763895128 0.9682573810384761 0.9604025017837378 +0.8 0.9871469931075889 0.9675296417037587 0.9597690996053858 +0.13 0.9865445871101235 0.9674056540465354 0.9593591289562655 +0.81 0.9867944143465305 0.9665969336143041 0.958655352777277 +0.12 0.9860781757776377 0.9663260283719048 0.9580368562851658 +0.8200000000000001 0.9864549313680258 0.9656932034475192 0.9575831833370224 +0.11 0.9856006833983757 0.9652264876173795 0.9566944793965717 +0.8300000000000001 0.9860711242024165 0.9646734201147149 0.9563714533956645 +0.1 0.9850174173907963 0.9638823389421385 0.9550516096455739 +0.84 0.9853206351253065 0.9626632776989181 0.9540034964846096 +0.09 0.984314274604457 0.9622673971991945 0.9530808252336734 +0.85 0.9848582532643756 0.9614270280257752 0.9525445850097236 +0.86 0.9842991640861259 0.9599292475871679 0.9507804371714829 +0.08 0.9834842034641367 0.9603693542343997 0.9507694397619152 +0.87 0.983717912814324 0.9583647814943136 0.9489467180649412 +0.07 0.9824294892846278 0.957964196543048 0.947841486209448 +0.88 0.9829815268876563 0.956381869068151 0.9466221515194483 +0.06 0.9810645057953875 0.9548668720391664 0.9440761094813981 +0.89 0.9821192199748964 0.9540467351176145 0.9438999979511103 +0.9 0.9811138668219357 0.9513104723517862 0.9407251011253867 +0.05 0.9793177298763154 0.950938057834354 0.9393176467108458 +0.91 0.9799090548270047 0.9480143049285275 0.9369174556361651 +0.04 0.9769020616791211 0.9455555687471061 0.9328166729260364 +0.92 0.9784493787562231 0.943993716866287 0.9323000165826166 +0.93 0.97656761554006 0.9387628306888754 0.9263405836465058 +0.03 0.9732604267613324 0.9375544022923067 0.9231924539795024 +0.9400000000000001 0.9740794168548765 0.9317662907284785 0.9184460358246912 +0.02 0.9672615509838962 0.9246524050533129 0.907754525820366 +0.9500000000000001 0.9706432849863904 0.9219524998125275 0.9075129929848407 +0.96 0.965680990766466 0.9074555994414956 0.8916552814222822 +0.01 0.9533326751863127 0.896025137470542 0.8738832498282166 +0.97 0.9583332493860093 0.8852502677179335 0.8680080646503308 +0.98 0.9459819117021898 0.8457979266416483 0.827727864356721 +0.99 0.9213880320665346 0.7582639187904058 0.7449078552166843 +0.0 0.20153099771730437 0.3354570095988826 0.0 +1.0 0.7984690022826957 0.0 0.0 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_i.txt" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_i.txt" new file mode 100644 index 0000000000000000000000000000000000000000..4fb1b27dbaf52176fb31f1a71f76ef0539e6b7af --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_i.txt" @@ -0,0 +1,102 @@ +threshold accuracy f1_score matthews_corrcoef +0.46 0.9854581326289605 0.972775571963276 0.9628577444279904 +0.47000000000000003 0.9854591454348786 0.9727604754631781 0.9628478665371665 +0.48 0.9854591454348786 0.972746410829301 0.9628380825013401 +0.45 0.9854277484514198 0.9727366781495503 0.9627941009644334 +0.43 0.9854044539153051 0.9727284941912507 0.9627644113787661 +0.44 0.9854095179448953 0.9727205589956258 0.9627623098961319 +0.49 0.9854287612573378 0.9726745235087986 0.962750543083079 +0.42 0.9853730569318463 0.9726866281356855 0.9626995139701157 +0.5 0.9854074923330592 0.9726194005457877 0.962686639398786 +0.51 0.9853943258561249 0.9725806788948613 0.9626446959891022 +0.41000000000000003 0.985325455053699 0.9726178844580179 0.9625972473381382 +0.4 0.9853021605175843 0.9725930122757319 0.9625563901433246 +0.52 0.9853548264253218 0.9724918293504191 0.9625354530003396 +0.39 0.9852677251163714 0.9725492268264972 0.9624899487158818 +0.53 0.9853143141886007 0.9724013490981902 0.9624242413096931 +0.54 0.9852758275637156 0.9723142667794054 0.9623184837939155 +0.38 0.9851856878370112 0.9724158870309221 0.962302287575806 +0.37 0.9851218810641754 0.9723164480627167 0.9621618328829465 +0.55 0.985215059208634 0.9721848550556189 0.9621559326865292 +0.56 0.9852018927316996 0.9721463389526138 0.9621163976249172 +0.36 0.9850479462321594 0.9721965358198188 0.9619944817334634 +0.5700000000000001 0.9851492268239622 0.972032173586302 0.9619755408006812 +0.35000000000000003 0.9849770498178975 0.9720839457751397 0.9618375956598434 +0.58 0.9850914968866346 0.9719101123595505 0.9618227229986753 +0.59 0.985035792561143 0.9717909994673223 0.9616751775038849 +0.6 0.9850003443540122 0.9717105112355687 0.9615801315005328 +0.34 0.9848545003018162 0.971876492780116 0.9615515824060037 +0.61 0.9848808332556849 0.9714664719577044 0.9612684028802148 +0.33 0.9847056178318662 0.9716215428965532 0.9612012803727398 +0.62 0.9847876551112263 0.9712748094230864 0.9610254544139554 +0.32 0.9846012988223093 0.9714485307373355 0.9609639990208674 +0.63 0.98470359222003 0.9711015397332303 0.9608065588842878 +0.31 0.984480774918064 0.9712454188395707 0.9606862717436827 +0.64 0.9845810427039488 0.9708528936505869 0.960488662442173 +0.3 0.9843906351913595 0.9710988512299678 0.9604867534594932 +0.29 0.9842407399154914 0.9708427494762563 0.9601380106102246 +0.65 0.984443301099097 0.9705760654682544 0.960132190200338 +0.66 0.9843359436717861 0.9703558600174039 0.9598541184621872 +0.28 0.9840574220443284 0.9705277151068631 0.9597096459479855 +0.67 0.9841982020669343 0.9700785529081453 0.9594983044847843 +0.27 0.9839338597223292 0.9703212963361404 0.9594306695247635 +0.68 0.9840361531200499 0.9697537236961785 0.9590801623510592 +0.26 0.9837312985387237 0.9699709299608348 0.958955441091477 +0.6900000000000001 0.9838852450382638 0.9694496873157927 0.9586910222414249 +0.25 0.9835378526083803 0.9696391213389122 0.958506826288244 +0.7000000000000001 0.9837231960913794 0.9691227921864584 0.9582735217529079 +0.24 0.9833514963194633 0.9693199549075381 0.9580761772690379 +0.71 0.9835611471444949 0.968796799514006 0.9578563564353773 +0.23 0.9831134869287268 0.9689106514535898 0.9575242268454072 +0.72 0.9833413682602831 0.9683571821578215 0.9572908696631107 +0.22 0.9828481317782036 0.9684527014062625 0.9569069608694541 +0.73 0.9831053844813826 0.967884529858088 0.9566841008815717 +0.21 0.9825635333152378 0.9679624839496065 0.9562475071729446 +0.74 0.9829048489096132 0.9674795338200133 0.9561691441173588 +0.2 0.9822931141351245 0.9674974948735539 0.9556231832647559 +0.75 0.9826607626833685 0.9669884266668723 0.9555425776563727 +0.76 0.9824268045163042 0.9665163365772983 0.9549425981105304 +0.19 0.9819588881821755 0.966919418429522 0.9548461006753965 +0.77 0.982198923184748 0.9660556057691565 0.9543587264249375 +0.18 0.9815608554563906 0.9662273965080991 0.9539145199981071 +0.78 0.9818525435607826 0.9653644675545591 0.9534696591748983 +0.17 0.9811618099246877 0.9655378139116064 0.952989264280361 +0.79 0.9815274328610957 0.964711436327157 0.9526363318993717 +0.16 0.9807252905740179 0.9647818644459867 0.9519744773767732 +0.8 0.9811658611483599 0.9639841341162981 0.9517100260598385 +0.15 0.980250284598463 0.9639600344505107 0.9508721247174847 +0.81 0.9807809948995094 0.9632095481459365 0.9507243170770308 +0.8200000000000001 0.9804011926802491 0.9624410685206428 0.9497530036636358 +0.14 0.9797165358796625 0.9630391091146328 0.9496389329854934 +0.8300000000000001 0.9799504940467268 0.9615291931935668 0.9486004980258426 +0.13 0.979109865134764 0.9619921942662131 0.948237006815191 +0.84 0.9794207965515984 0.9604584861782763 0.9472455003973137 +0.12 0.9784576181235543 0.9608725405071301 0.9467422280214663 +0.85 0.9788505868197489 0.9593045010211878 0.9457870155184918 +0.11 0.9777749869348037 0.9597078340573829 0.9451931200471391 +0.86 0.9781618787954902 0.9579057926876885 0.9440267807154546 +0.1 0.9768837177269394 0.958181258519854 0.9431567815719463 +0.87 0.9774711451593954 0.9564989517819706 0.9422622007724746 +0.09 0.9756987348028472 0.956154040136395 0.9404519371353048 +0.88 0.976646721142121 0.9548169398264632 0.9401558138655812 +0.89 0.9756815171022407 0.952841287392984 0.9376902563411496 +0.08 0.974255486369658 0.9536929587574213 0.9371699919565479 +0.9 0.9745623665628202 0.9505407513016627 0.9348324882531343 +0.07 0.972494216878208 0.9507132227505354 0.9332100566210166 +0.91 0.9733054744185481 0.9479457357451658 0.9316233380221253 +0.06 0.9703055432893506 0.9470350732989494 0.928330996225006 +0.92 0.9716991642325564 0.944614134366669 0.9275205394555042 +0.93 0.9696765908142555 0.9403877781007713 0.9223556690771166 +0.05 0.9673299195021856 0.9420726834550591 0.9217567204101295 +0.9400000000000001 0.9671192558712359 0.93499888878433 0.9158224431599346 +0.04 0.9632189402809118 0.9353096465508335 0.912832827458089 +0.9500000000000001 0.9636777413617783 0.9276548703807181 0.9070312399035896 +0.03 0.9569810686317802 0.9252199387674979 0.8995588122322399 +0.96 0.9588345034617707 0.9171387856535628 0.8946567841519151 +0.02 0.946381041893704 0.9085734515719577 0.8777896856531707 +0.97 0.9514703916317924 0.9007588686560734 0.8758057550153014 +0.98 0.9387556261368747 0.8712697608474439 0.8431438696821653 +0.01 0.922514270435385 0.8731988186040036 0.8319234887584478 +0.99 0.9105996216157091 0.799951047271112 0.7699319504806044 +0.0 0.26752458079963054 0.4221213297983697 0.0 +1.0 0.7324754192003695 0.0 0.0 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_o.txt" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_o.txt" new file mode 100644 index 0000000000000000000000000000000000000000..0da46463399d2ee41cfdd2ee6f9619f83ba8bdb8 --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_o.txt" @@ -0,0 +1,102 @@ +threshold accuracy f1_score matthews_corrcoef +0.59 0.9894694362218434 0.990087155069195 0.9788807734371113 +0.61 0.9894694362218434 0.9900840784201375 0.9788772970843751 +0.6 0.9894546461041773 0.9900715540375217 0.9788491308265946 +0.58 0.9894504203562728 0.9900707753067958 0.9788444663312986 +0.62 0.9894525332302251 0.990066699565418 0.978841862154128 +0.63 0.9894483074823205 0.990061295971979 0.9788320132944275 +0.64 0.9894461946083682 0.9900577822982745 0.9788264129935058 +0.5700000000000001 0.98942929161675 0.990052155101278 0.978803678617924 +0.65 0.9894229529948931 0.9900343601565517 0.9787785000095722 +0.55 0.9894039371293227 0.9900312678279294 0.9787568846457002 +0.56 0.9893933727595613 0.9900196029312943 0.9787332368148063 +0.54 0.9893785826418954 0.9900089237979206 0.9787082087752563 +0.53 0.989376469767943 0.9900085050911316 0.9787063978749397 +0.66 0.9893828083897999 0.9899940860568336 0.9786961251306724 +0.52 0.9893701311460862 0.9900040333556525 0.9786960798194421 +0.67 0.9893722440200385 0.9899822348976713 0.9786737239869598 +0.51 0.9893468895326112 0.9899837897112342 0.9786521215866855 +0.5 0.9893173092972792 0.9899572148750808 0.978594873935645 +0.68 0.9893194221712315 0.9899307803396246 0.9785667767717895 +0.49 0.9892729389442814 0.9899168843033475 0.9785082449373651 +0.48 0.9892370200870927 0.9898848694008364 0.9784393506128994 +0.6900000000000001 0.9892412458349972 0.9898549973900328 0.9784088888956269 +0.47000000000000003 0.9892095527257131 0.9898608069662033 0.9783875598046956 +0.7000000000000001 0.9891672952466675 0.9897832502695172 0.9782597049529643 +0.46 0.9891165862718128 0.9897750348871601 0.978203983400578 +0.71 0.9891081347760037 0.9897254472071354 0.9781403935319984 +0.45 0.989065877296958 0.9897288210274354 0.9781050403384702 +0.44 0.9890384099355785 0.9897047150341324 0.9780534238084657 +0.72 0.9890299584397694 0.9896497418415963 0.9779830978970506 +0.43 0.9889877009607237 0.9896587711953525 0.977955199767321 +0.73 0.9889855880867715 0.9896056197048578 0.9778938041532074 +0.42 0.9889454434816782 0.9896208186698302 0.9778741577326735 +0.41000000000000003 0.9889179761202986 0.9895966598236688 0.9778227049792823 +0.74 0.9889306533640123 0.9895516940854091 0.977783514639169 +0.4 0.9888609285235871 0.9895444953910105 0.9777113544859123 +0.39 0.988837686910112 0.9895244043901134 0.9776687886454577 +0.75 0.9888567027756825 0.9894796055151284 0.977635263284689 +0.38 0.9887552848259732 0.9894488082823485 0.9775075021218345 +0.76 0.9887743006915436 0.9893994624910963 0.9774702760529216 +0.37 0.9886580926241684 0.9893596777774253 0.9773174178059355 +0.77 0.9886623183720729 0.9892911738996811 0.9772462022716343 +0.36 0.9885503360526021 0.9892602259732487 0.9771051151791285 +0.78 0.9885482231786499 0.9891799953685589 0.9770185085691007 +0.35000000000000003 0.9884552567247495 0.9891732517496373 0.9769198376993325 +0.34 0.9883876447582767 0.9891121315736299 0.9767900925733087 +0.79 0.9884003220019904 0.9890369269847356 0.9767233564036014 +0.33 0.9883031298001855 0.9890348879126277 0.976625737052492 +0.32 0.988184808858858 0.9889261625351008 0.9763940927075077 +0.8 0.9882249534639512 0.9888675593127569 0.9763736793385334 +0.31 0.9880369076821984 0.9887898058905987 0.9761033038321231 +0.81 0.9880622621696258 0.9887099803773465 0.9760500068585012 +0.3 0.9879333768585368 0.9886953647234411 0.9759026783295827 +0.29 0.9877960400516387 0.9885688501165676 0.9756330022750652 +0.8200000000000001 0.9878066044214 0.9884633760667529 0.9755412178099068 +0.28 0.9876650418665973 0.9884483877605938 0.9753764109413336 +0.27 0.9875382694294607 0.9883324827005021 0.9751301526893533 +0.8300000000000001 0.9875361565555084 0.9882023539293822 0.9750038015811198 +0.26 0.9873650137653738 0.9881730297612455 0.9747904356325933 +0.25 0.9871980967231437 0.98801970155038 0.9744640485671608 +0.84 0.9872319027063803 0.9879086556257866 0.97439998702376 +0.24 0.9869973736976774 0.9878353001640672 0.9740714969506686 +0.85 0.986881165630302 0.9875695446837731 0.9737054912765767 +0.23 0.9867818605545449 0.9876371454769659 0.9736494627432036 +0.22 0.9865642345374602 0.9874375482772313 0.9732248881333779 +0.21 0.9863592857640892 0.9872497748566191 0.9728256866981034 +0.86 0.9864311234784666 0.9871348496928951 0.9728149089703013 +0.2 0.9860761606544839 0.9869896549001026 0.9722717163128373 +0.19 0.9858458573936855 0.9867788200605108 0.971823597610378 +0.87 0.9858754376290174 0.9865980228825979 0.9717167037097608 +0.18 0.9855880867715074 0.9865428739970289 0.9713220942188608 +0.17 0.985296510166093 0.986275758836743 0.9707539114214109 +0.88 0.9852331239475247 0.9859765359294834 0.9704504489585895 +0.16 0.9849140799807306 0.9859249884679527 0.9700069020420736 +0.15 0.9845612300307001 0.9856017718470994 0.9693188843553249 +0.89 0.9844936180642272 0.9852603588205876 0.9689953460948165 +0.14 0.9841492196100057 0.985224546711467 0.9685157876244572 +0.13 0.9837308705674546 0.9848419825899344 0.9677016001169504 +0.9 0.9836780487186476 0.984467897305155 0.9673978021932413 +0.12 0.9832702640458578 0.984420930022076 0.9668051956879529 +0.11 0.9827272554401222 0.9839249793040252 0.9657492537969955 +0.91 0.9824737105658488 0.9832974582738492 0.965041169274909 +0.1 0.9821102962460568 0.9833623825178275 0.9645518663524787 +0.09 0.9815165786654666 0.9828218642243918 0.9634019195145367 +0.92 0.9809904730513491 0.9818513370947213 0.9621518754328083 +0.08 0.9808277817570237 0.9821951353515058 0.9620677814456837 +0.07 0.9798643112347847 0.9813185606862538 0.960199220711216 +0.93 0.9790107101580641 0.9799149204603353 0.9583120577769876 +0.06 0.978744488040077 0.9803020097432604 0.9580324134268737 +0.05 0.9773161852483366 0.9790077156812521 0.9552713733126795 +0.9400000000000001 0.9764055365749046 0.9773550853214638 0.9532886756770073 +0.04 0.9753892442038585 0.9772682919213446 0.9515619007997584 +0.03 0.9727840706206989 0.9749261761688692 0.9465638683010106 +0.9500000000000001 0.9727566032593193 0.9737480759990553 0.9463025242846116 +0.02 0.968545645472428 0.9711375343889287 0.9384672583990805 +0.96 0.9676033036897118 0.9686085434388724 0.9365361918436202 +0.01 0.9597476383351399 0.96336606818361 0.9218253093135342 +0.97 0.9589172788718944 0.9598272762959443 0.9203104593323731 +0.98 0.9426037790863511 0.9429027233919124 0.8906384466464213 +0.99 0.9025141087158164 0.898865666407286 0.8212241902290963 +0.0 0.5294228262224561 0.6923171501632224 0.0 +1.0 0.47057717377754393 0.0 0.0 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_u.txt" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_u.txt" new file mode 100644 index 0000000000000000000000000000000000000000..628e76db41ceab1b5a591c7f5d4392a311f2a0c8 --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_u.txt" @@ -0,0 +1,102 @@ +threshold accuracy f1_score matthews_corrcoef +0.52 0.9928140367988229 0.9711268448003827 0.9670232979845143 +0.5 0.9928012781346266 0.9711195031508065 0.9670099828107956 +0.48 0.992784266582365 0.9710933418141341 0.9669780960055968 +0.51 0.992794189987851 0.9710688530561832 0.96695425543699 +0.49 0.9927800136942996 0.9710544412933146 0.9669343593509341 +0.54 0.9928026957639817 0.9710393766364149 0.9669308284450691 +0.53 0.9927970252465612 0.9710360552942852 0.96692325236492 +0.47000000000000003 0.9927587492539726 0.9710102156640181 0.9668830741503012 +0.55 0.9927956076172061 0.9709865266042474 0.9668760066968082 +0.46 0.9927374848136455 0.9709489920326632 0.9668138168120322 +0.56 0.9927729255475239 0.9708745629470509 0.9667535454750731 +0.45 0.992710549855898 0.9708645444964472 0.9667187888720384 +0.5700000000000001 0.9927389024430007 0.9707143592265205 0.9665774056803417 +0.44 0.9926623504578234 0.9706964684035916 0.9665290676928263 +0.58 0.992721890890739 0.9706239128444567 0.9664812645304932 +0.43 0.9926311626120105 0.9705972192368173 0.9664189601188978 +0.59 0.9927062969678325 0.970541588177701 0.9663941855137054 +0.42 0.9925886337313564 0.9704552647045527 0.9662613456382803 +0.41000000000000003 0.9925772926965153 0.9704337809299016 0.96624164465477 +0.6 0.9926538446816926 0.9703074683993995 0.9661355369241852 +0.4 0.9925276756690856 0.9702594889213635 0.9660478313759807 +0.61 0.9926226568358796 0.9701581548977555 0.9659748090714142 +0.62 0.9925772926965153 0.9699512195121951 0.9657491604935617 +0.39 0.9924397826490673 0.9699422291108919 0.9656937598465762 +0.63 0.9925503577387678 0.9698170644151517 0.9656081554919279 +0.64 0.9925290932984407 0.9697084655354761 0.9654957618685093 +0.38 0.9923462191116283 0.9695986846180267 0.965309469956307 +0.65 0.9924738057535905 0.969455679379563 0.965222995740715 +0.37 0.992268249497096 0.9693154199297866 0.9649943088483127 +0.66 0.992394418509703 0.9691043426682561 0.9648391661156392 +0.67 0.9923618130345349 0.9689452449567723 0.9646741081896698 +0.36 0.9921761035890122 0.9689774764899973 0.9646177898999926 +0.68 0.9922937668254884 0.9686443708685671 0.9643466147029192 +0.35000000000000003 0.9920867929396388 0.9686471427448073 0.9642491061106683 +0.6900000000000001 0.992215797210956 0.9682976819375885 0.9639711466263379 +0.34 0.9919450300041253 0.9681148359726603 0.9636520664915862 +0.7000000000000001 0.9921378275964235 0.9679503478883983 0.9635961765311366 +0.33 0.9918798190537891 0.9678815745205787 0.9633960810947251 +0.71 0.9920598579818912 0.967604615518089 0.9632225756256451 +0.32 0.9917933436631259 0.9675702625637922 0.9630541116528107 +0.72 0.9919804707380037 0.9672560993256736 0.9628440708604101 +0.31 0.9917182093073038 0.9673050447163117 0.962766274776595 +0.73 0.9919110068996021 0.9669458830071948 0.9625114178482207 +0.3 0.9916133047350238 0.966924220907739 0.9623477572189025 +0.74 0.9918287843970043 0.9665839575168703 0.9621199782768654 +0.29 0.9915296646030709 0.9666255187707019 0.9620229855549316 +0.75 0.9917564852998924 0.9662626696294405 0.9617752256173709 +0.28 0.9914190895133703 0.9662200246666927 0.9615759974235633 +0.76 0.9916983624963319 0.9660025080117041 0.9614978631420541 +0.77 0.9916133047350238 0.9656286311875436 0.9610942354679467 +0.27 0.9912773265778569 0.9656975609756098 0.9609988567657044 +0.78 0.9915126530508093 0.9651876101151885 0.9606173323685533 +0.26 0.9911284754955678 0.9651469752832017 0.9603896717530007 +0.79 0.991383648779492 0.9646278298318105 0.9600076985505027 +0.25 0.9909654481197273 0.9645477656692424 0.9597296414719619 +0.8 0.9912943381301186 0.964230587769319 0.9595839163783688 +0.81 0.991193686445904 0.9637856051861438 0.9591072633284868 +0.24 0.9907712328980739 0.9638365478624122 0.9589486275041685 +0.8200000000000001 0.9910816937268483 0.9632948836883654 0.9585779550067967 +0.23 0.9905841058231961 0.9631458629261365 0.9581864304250747 +0.8300000000000001 0.9909810420426338 0.9628522380913455 0.9581022157084337 +0.84 0.9908803903584192 0.9624027632478682 0.9576259104192758 +0.22 0.9904026492657388 0.962478107610792 0.9574511531607405 +0.85 0.9907031866890275 0.9616334171883227 0.9567900742851272 +0.21 0.9902070164147303 0.9617603099916967 0.9566625211763291 +0.86 0.9905359064251216 0.9609046509176515 0.9560008031815556 +0.2 0.9899816133472639 0.9609344337510572 0.955756111702252 +0.87 0.9903445264621784 0.9600633263947931 0.9550973562345781 +0.19 0.9897562102797975 0.960111286529693 0.9548549285928074 +0.88 0.9901503112405249 0.959213863060017 0.954180784701494 +0.18 0.9895322248416862 0.9592925818117667 0.9539578432504391 +0.89 0.9899149847675726 0.958182951058652 0.9530699575414446 +0.17 0.989267128152276 0.9583289759749016 0.9529063944303346 +0.16 0.9889921080573799 0.9573227368409483 0.9518020303436662 +0.9 0.9896427999313867 0.9569896271178459 0.9517847449327415 +0.15 0.9886944058928017 0.9562392655878754 0.9506173057797837 +0.91 0.9893564388016496 0.9557217333868037 0.9504322591473706 +0.14 0.9883995389869337 0.9551741705057765 0.94945935208748 +0.92 0.9888730271915487 0.953597124428758 0.9481470934822537 +0.13 0.9880422963894397 0.9538761025169924 0.9480395668876874 +0.12 0.9876751303864599 0.9525421138246887 0.9465788368998262 +0.93 0.988212411912056 0.950681210223195 0.9450202421092345 +0.11 0.9872626002441158 0.9510490272458336 0.9449471226481888 +0.1 0.9867990354449868 0.9493769978472176 0.9431224921711455 +0.9400000000000001 0.9875191911573952 0.9476008522896356 0.9417341602738616 +0.09 0.9862348189616432 0.9473461596859207 0.9409059878318029 +0.08 0.9856082267866737 0.9451071146629755 0.9384725960445981 +0.9500000000000001 0.9864460457355583 0.9427988537036263 0.9366359174808223 +0.07 0.9848455421936113 0.9423862546214955 0.9355113163079308 +0.06 0.9839737001402036 0.9392912569743902 0.9321461973906773 +0.96 0.9848554655990972 0.9355906451787942 0.9290549447454254 +0.05 0.9828707845019088 0.9354116219524581 0.927945617515753 +0.04 0.9814574080348396 0.9304802602206773 0.922616631300931 +0.97 0.9822640391379113 0.9236468384017772 0.916630686160784 +0.03 0.9794798150844269 0.9236562712615306 0.9152614267101659 +0.02 0.9764630998167005 0.9134444450236942 0.9043256938360562 +0.98 0.9772966658775196 0.9000143594738188 0.8925398313065815 +0.01 0.9706380607964525 0.8943286871696496 0.8840242131250227 +0.99 0.9643735566761128 0.8333079071124878 0.8279558886133505 +0.0 0.12439272302499423 0.22126205635756163 0.0 +1.0 0.8756072769750057 0.0 0.0 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_y.txt" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_y.txt" new file mode 100644 index 0000000000000000000000000000000000000000..df8cad9c8db3aae0e4505f98ec0f3efeeda30a5a --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/evaluation/eval_y.txt" @@ -0,0 +1,102 @@ +threshold accuracy f1_score matthews_corrcoef +0.19 0.987866927592955 0.9495114006514658 0.9430893605208979 +0.15 0.9876712328767123 0.9490703314470493 0.942804073479789 +0.14 0.9876712328767123 0.9490703314470493 0.942804073479789 +0.16 0.9876712328767123 0.9489878542510122 0.9426570762030837 +0.2 0.987866927592955 0.9491803278688524 0.9425710100584036 +0.21 0.987866927592955 0.949096880131363 0.9424483815243211 +0.22 0.987866927592955 0.949096880131363 0.9424483815243211 +0.23 0.987866927592955 0.9490131578947368 0.9423285833999039 +0.17 0.9876712328767123 0.9487388120423109 0.9422319643332772 +0.18 0.9876712328767123 0.9487388120423109 0.9422319643332772 +0.24 0.9876712328767123 0.9481481481481482 0.9413355164150552 +0.12 0.987279843444227 0.9476228847703466 0.9412701923824878 +0.13 0.987279843444227 0.9476228847703466 0.9412701923824878 +0.26 0.9876712328767123 0.9479768786127167 0.9411047706794936 +0.27 0.9874755381604696 0.9471074380165289 0.9401113767781424 +0.36 0.9874755381604696 0.9465776293823038 0.9394906681213889 +0.25 0.987279843444227 0.9464138499587799 0.9393476137942629 +0.38 0.9874755381604696 0.9463986599664992 0.939308032889167 +0.39 0.9874755381604696 0.9463986599664992 0.939308032889167 +0.4 0.9874755381604696 0.9463986599664992 0.939308032889167 +0.11 0.986692759295499 0.9453376205787782 0.9387588825234728 +0.37 0.987279843444227 0.9456066945606695 0.938404905544659 +0.1 0.9864970645792563 0.9445783132530121 0.937925463849018 +0.3 0.9870841487279843 0.9451827242524917 0.9379045444826889 +0.35000000000000003 0.9870841487279843 0.9450000000000001 0.937698181523906 +0.41000000000000003 0.9870841487279843 0.9446308724832214 0.9373219972754668 +0.28 0.9868884540117416 0.9444904722452362 0.9371276318344899 +0.29 0.9868884540117416 0.9444904722452362 0.9371276318344899 +0.31 0.9868884540117416 0.944305901911887 0.9369096462442917 +0.32 0.9868884540117416 0.944305901911887 0.9369096462442917 +0.33 0.9868884540117416 0.9442131557035803 0.9368051123685351 +0.34 0.9868884540117416 0.9442131557035803 0.9368051123685351 +0.42 0.9868884540117416 0.9437447523089841 0.9363280775108732 +0.09 0.9859099804305284 0.9423076923076924 0.9354361630956703 +0.43 0.986692759295499 0.942857142857143 0.9353335535601789 +0.45 0.986692759295499 0.9426644182124789 0.9351649136232277 +0.46 0.986692759295499 0.9426644182124789 0.9351649136232277 +0.48 0.986692759295499 0.9425675675675675 0.9350853857127959 +0.49 0.986692759295499 0.9425675675675675 0.9350853857127959 +0.44 0.9864970645792563 0.9418702611625948 0.9342526988851971 +0.47000000000000003 0.9864970645792563 0.9417721518987343 0.9341701568116597 +0.5 0.9864970645792563 0.9414758269720102 0.9339419110499494 +0.51 0.986105675146771 0.939677145284622 0.9319517820735683 +0.52 0.9859099804305284 0.9387755102040817 0.9309557953055786 +0.53 0.9859099804305284 0.938671209540034 0.9308869198171993 +0.56 0.9859099804305284 0.9385665529010239 0.9308213942323759 +0.55 0.9859099804305284 0.9385665529010239 0.9308213942323759 +0.54 0.9857142857142858 0.937766410912191 0.9298905306107951 +0.5700000000000001 0.9857142857142858 0.9375534644995722 0.9297633126381263 +0.08 0.9845401174168298 0.9370517928286852 0.9296908136807414 +0.58 0.9849315068493151 0.9339055793991415 0.9257733298506917 +0.07 0.9831702544031311 0.9319620253164558 0.924238929635699 +0.59 0.9843444227005871 0.9311531841652324 0.9227742309435587 +0.6 0.9839530332681018 0.9293103448275861 0.9207716485640204 +0.61 0.9835616438356164 0.9274611398963731 0.9187664978752441 +0.62 0.9833659491193738 0.9265341400172861 0.9177629535143009 +0.63 0.9831702544031311 0.9256055363321799 0.9167587599729842 +0.06 0.9810176125244618 0.9240407204385279 0.9157732206431445 +0.65 0.9825831702544031 0.9228100607111882 0.9137422602987627 +0.64 0.9825831702544031 0.9228100607111882 0.9137422602987627 +0.66 0.9823874755381604 0.9218749999999999 0.9127354460107304 +0.67 0.9819960861056751 0.9199999999999999 0.9107198312345463 +0.68 0.9818003913894325 0.918918918918919 0.9096571974772688 +0.6900000000000001 0.9818003913894325 0.918918918918919 0.9096571974772688 +0.7000000000000001 0.9818003913894325 0.918918918918919 0.9096571974772688 +0.05 0.9786692759295499 0.9155693261037955 0.9067947137287664 +0.71 0.9812133072407045 0.9159369527145359 0.9065779380625489 +0.72 0.9810176125244618 0.9149868536371605 0.9055669071555454 +0.73 0.9806262230919766 0.9129287598944591 0.9034977004988515 +0.74 0.9804305283757339 0.9118165784832452 0.9024437484905495 +0.04 0.9772994129158513 0.9109062980030721 0.9020781361283716 +0.75 0.9796477495107632 0.9078014184397164 0.8983502233954263 +0.76 0.9790606653620352 0.9048888888888889 0.8953007584804535 +0.03 0.9747553816046967 0.9020501138952165 0.8928381457279491 +0.77 0.9772994129158513 0.8960573476702509 0.8861139185336214 +0.78 0.9771037181996086 0.895067264573991 0.8850895397544732 +0.79 0.9765166340508806 0.8920863309352518 0.8820119885758686 +0.02 0.9700587084148729 0.8860759493670886 0.8762347497202813 +0.8 0.9749510763209394 0.8840579710144928 0.8737723072185385 +0.8200000000000001 0.9747553816046967 0.8826205641492267 0.8726914807881576 +0.81 0.974559686888454 0.882032667876588 0.8717047885257145 +0.8300000000000001 0.9739726027397261 0.8778696051423324 0.8685162836215113 +0.84 0.9729941291585127 0.8726937269372693 0.8633292222297249 +0.85 0.9720156555772994 0.8674698795180723 0.8581222204436203 +0.86 0.9718199608610567 0.866171003717472 0.8570785212762421 +0.87 0.9714285714285714 0.8640595903165735 0.854989043329277 +0.88 0.9700587084148729 0.8566073102155576 0.8476497573109029 +0.89 0.9681017612524462 0.8457899716177862 0.8370927717122987 +0.01 0.9573385518590998 0.8451704545454546 0.8340554986527192 +0.9 0.9667318982387475 0.8374760994263862 0.8296820009206766 +0.91 0.9651663405088062 0.8278529980657641 0.8211848112380201 +0.92 0.964187866927593 0.8218111002921129 0.8158500081127654 +0.93 0.961252446183953 0.8043478260869565 0.7995899856845476 +0.9400000000000001 0.9583170254403132 0.7859296482412059 0.7831561044173203 +0.9500000000000001 0.9549902152641878 0.7628865979381443 0.7644957099764645 +0.96 0.9475538160469668 0.7118279569892473 0.7209854264374289 +0.97 0.9401174168297456 0.6569506726457399 0.6753231500063263 +0.98 0.9281800391389432 0.55729794933655 0.5966832083326772 +0.99 0.9064579256360078 0.3324022346368715 0.42454643361451366 +0.0 0.11682974559686889 0.2092167513579814 0.0 +1.0 0.8831702544031311 0.0 0.0 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/my_dataset.py" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/my_dataset.py" new file mode 100644 index 0000000000000000000000000000000000000000..1d5e2c76f6231053b5beda9cac94ef94481c6da1 --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/my_dataset.py" @@ -0,0 +1,24 @@ +import pandas as pd +import torch + + +# 读取训练数据 +class MyDataset(torch.utils.data.Dataset): + def __init__(self, path): + df = pd.read_csv(path, sep=' ', header=None) + self.X = df.values[:, :-1].astype('int64') + self.y = df.values[:, -1] + self.y = self.y.astype('float32') + self.y = self.y.reshape((len(self.y), 1)) + + # 以下两个函数供torch.utils.data.random_split调用 + def __len__(self): + return len(self.y) + + def __getitem__(self, index): + return [self.X[index], self.y[index]] + + def get_trainset_and_testset(self, test_ratio=0.2): + testset_size = round(len(self.X) * test_ratio) + trainset_size = len(self.X) - testset_size + return torch.utils.data.random_split(self, [trainset_size, testset_size]) diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/my_net.py" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/my_net.py" new file mode 100644 index 0000000000000000000000000000000000000000..5df911769bdff5319078957ee2fade2d6db9bc79 --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/my_net.py" @@ -0,0 +1,49 @@ +import torch +from my_tools import * + + +# 神经网络 +class NN(torch.nn.Module): + def __init__(self, n_inputs): + super(NN, self).__init__() + # 读取词向量 + _, embeddings_matrix = build_char_int_mapping_and_embeddings_matrix( + 'unmacronized.vector') + self.embedding = torch.nn.Embedding.from_pretrained(embeddings_matrix) + # 读取位置信息 + window = 10 + self.positional_cuda = positional_embedding(window).cuda() + # 以下各层均使用此初始化函数 + initialize = lambda x: torch.nn.init.kaiming_uniform_(x.weight, nonlinearity='relu') + + # 4个dense层 + self.layer_1 = torch.nn.Linear(n_inputs, 128) + initialize(self.layer_1) + self.activation_1 = torch.nn.ReLU() + self.layer_2 = torch.nn.Linear(128, 32) + initialize(self.layer_2) + self.activation_2 = torch.nn.ReLU() + self.layer_3 = torch.nn.Linear(32, 8) + initialize(self.layer_3) + self.activation_3 = torch.nn.ReLU() + self.layer_4 = torch.nn.Linear(8, 1) + initialize(self.layer_4) + self.activation_4 = torch.nn.Sigmoid() + + self.layers = [ + self.layer_1, self.activation_1, + self.layer_2, self.activation_2, + self.layer_3, self.activation_3, self.layer_4, self.activation_4 + ] + + def forward(self, X): + # 嵌入词向量 + X = self.embedding(X) + # 加上位置信息 + X = X * self.positional_cuda + # 展平成1维 + X = X.flatten(start_dim=1) + # 应用每一层 + for layer in self.layers: + X = layer(X) + return X \ No newline at end of file diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/my_tools.py" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/my_tools.py" new file mode 100644 index 0000000000000000000000000000000000000000..c9529d250019919a121882d7624c42b25a4552a5 --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/my_tools.py" @@ -0,0 +1,36 @@ +import pandas as pd +import numpy as np +import torch + + +# 建立字符和整数之间的映射,以及词向量嵌入矩阵 +def build_char_int_mapping_and_embeddings_matrix(vector_txt): + latinchar_vector = pd.read_csv(vector_txt, + sep=' ', + skiprows=1, + header=None) + char_int_mapping = latinchar_vector[[0]].reset_index() + char_int_mapping['index'] += 1 + char_int_mapping.set_index(0, inplace=True) + char_int_mapping = char_int_mapping.to_dict()['index'] + # 第0行代表词向量本文文件中未提到的字符 + embeddings_matrix = latinchar_vector.drop(columns=[0]).values + embeddings_matrix = np.insert(embeddings_matrix, + 0, + np.random.random( + (1, embeddings_matrix.shape[1])), + axis=0) + embeddings_matrix = torch.FloatTensor(embeddings_matrix) + return char_int_mapping, embeddings_matrix + + +def positional_embedding(window): + # 取元音附近左右各10个字符来预测元音是否加长音 + # 对于靠近元音的字符,加上较高权重;对于远离元音的字符,加上较低权重 + positional = abs(np.array(range(window * 2 + 1), dtype=float) - window) + positional = window - positional + 1 + positional[window] = 0 + positional = positional / window + positional = torch.Tensor(positional).unsqueeze(-1) + positional_cuda = positional + return positional_cuda diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/unmacronized.vector" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/unmacronized.vector" new file mode 100644 index 0000000000000000000000000000000000000000..abf8f17eae685d3701748682ffc891dd888de2b9 --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/unmacronized.vector" @@ -0,0 +1,28 @@ +27 16 +@ 0.03736192 1.3407425 0.22756751 0.3072071 -0.84254 -0.4211315 1.0424643 -0.33740157 0.04217726 0.02819223 -0.3695598 1.2144175 0.54571825 0.32134956 1.4474334 0.07726425 +e 0.5151084 0.31712723 0.18643542 -1.0779173 -0.11040375 0.8847372 0.57577294 0.34622103 -0.20765014 1.2464855 -0.040029716 -0.4038346 -0.645032 0.437435 -0.31218204 -0.3101551 +i 1.141827 0.86990917 -0.71954525 -0.647558 -0.5202247 0.3932742 -0.28903365 -0.048132814 -0.220453 1.010941 -0.3866953 0.3053795 -0.6439724 0.34935346 -0.3046797 -0.668358 +t 0.3000374 -0.32531074 1.4753296 -0.8174252 -0.2727576 -0.92300147 -1.0419275 0.2394559 0.6474478 0.30066553 0.010620613 -0.26391166 -0.6149605 0.080670625 1.2443511 -0.53291774 +u -0.01565617 1.2886367 0.8761222 0.515973 -0.011085723 -0.018558541 -0.081254125 -0.51820016 0.08746099 0.86239314 -0.18250577 0.42482427 0.24223982 -0.40784 -2.0759492 0.102265835 +a 0.16965851 0.43410665 -0.6986218 -0.34764785 -0.38439628 0.87292254 0.44232056 0.2737173 0.16992939 1.4036032 -0.028768918 0.048497096 -0.16424416 -0.79853296 -0.32189727 0.062245276 +s -0.16524838 -1.1701959 -0.61021703 0.020351514 -0.67782384 -0.7468922 -0.95461285 -0.10652105 0.59673536 0.48790267 -0.38594854 0.41170257 -0.19181798 0.23221296 1.0135231 -0.008093784 +n -0.91794384 -0.06147431 0.23488696 0.003993267 -0.44266063 0.89108866 -1.5685215 -0.38516936 -0.34812096 0.099465586 0.10618633 -0.08248255 0.42388985 0.8794743 1.0431089 0.5220406 +r 0.5818189 -0.84188974 0.4593393 0.56504464 -0.94136405 0.15901074 -0.9830992 -0.3451071 -0.19374546 1.6970513 -0.4310954 -0.2030182 0.8283338 0.5633404 2.1169338 0.44745606 +o 0.04887558 0.18275154 -0.4689348 -0.9678518 0.797708 0.074179776 0.342336 0.62936693 0.3824927 1.4327812 -0.34622294 0.14514913 -0.35507548 -0.3425371 -0.23766448 -0.90450186 +m 0.282649 -0.439233 0.42453912 -0.396274 -0.42079118 -0.4158479 -0.32352352 0.8798871 0.59537154 -0.6220228 0.48890638 -0.41028872 0.8471727 0.15000564 0.93196946 0.2155409 +c 0.67069125 -0.82106817 -0.17933191 -0.8730931 -0.49398646 -0.3746407 -0.75744087 -0.67552716 0.82217777 -0.5477965 1.2874186 0.34766492 0.7472675 0.55993176 0.96923864 -0.7872767 +d -0.82033813 -0.617142 1.0926228 0.22179998 0.07464168 -0.79419214 -1.1676018 -0.1592853 -0.3057166 -0.53726214 -0.25781304 0.07982264 0.5087121 -0.25419837 0.37565956 -0.22248831 +l 0.24742782 0.5632595 -0.39942014 0.24655136 -1.544483 -0.67100537 -1.9403735 0.2358858 0.19798444 0.68141454 -0.0453153 -0.2597482 -0.33099702 -0.04768504 1.1664518 0.7062777 +p -1.0430073 -0.9324396 1.3384033 -0.5456821 -0.024230413 0.05498438 -0.019870473 -0.17553104 1.8378531 -0.20445858 1.6215357 0.24227183 -0.95863485 1.1141088 0.95255595 0.9766639 +q 1.7541552 -2.0221968 0.013373923 0.8750735 -1.1656107 -0.7619582 -1.4068145 2.3216777 -0.12686925 -3.3281398 0.9827847 -1.350802 1.5726597 0.3461608 0.8381054 0.60934776 +b -0.5715424 0.13199362 -0.86638993 -0.1480671 -0.90070206 -0.66471624 -0.53392154 0.14316963 0.7722525 -1.2136884 1.9351372 0.37743863 -1.1019539 0.122101106 0.9435844 -0.48170248 +v -1.0997225 -0.9493142 0.6002022 -1.69077 -1.4326847 -0.9286406 -0.40309897 -0.8891048 0.19032486 -0.23054554 0.10183605 -0.20392726 -1.4821066 0.23104757 -0.23628025 0.4063643 +g 0.13550594 -0.25084922 0.939274 -2.308393 -0.81081593 -1.2118245 -0.11546878 -0.87821335 1.1147275 -0.50162274 1.1731786 0.05089318 0.20129034 -0.47222605 0.1997933 0.06248192 +f -0.15241654 -0.6746029 1.2214694 -1.4144933 -0.8397472 1.0532364 -1.4826883 -0.11585658 1.6915563 -1.5119352 -0.5504938 0.47584304 -0.1577317 -0.74779093 0.37353298 0.41292056 +h 0.3308448 -0.15895319 -0.0044205245 0.20456827 1.4837102 -0.6985748 -1.4400992 -0.32254824 1.2501053 1.5210899 0.42589176 1.1779765 -0.49719268 1.2262808 0.06582477 1.3357706 +x -0.60802805 -1.5827972 0.7603572 0.68372154 -0.5862795 -0.1223567 0.087512955 -0.21684514 -0.15759389 0.32755044 0.82414716 -0.20149758 -0.5245551 -0.5181265 0.53414005 0.5751256 +y 0.030293722 4.0183964 -1.8183737 0.44123346 2.877556 2.3926923 0.22144589 1.3054038 0.012538971 3.6592042 3.5417593 0.62023884 -1.6456021 0.21617515 2.3678298 -0.51037526 +j 2.0441806 -0.76502305 -0.053018197 1.64456 1.0430455 1.3226048 -0.3677843 0.4569834 -2.568891 -1.2171983 1.0023258 0.79760927 -2.2405312 0.122794166 0.5619936 -1.2315358 +z 2.6294737 4.4940753 -0.92292434 -0.09176495 2.3179543 2.4599714 -1.1034288 -0.6080179 -0.25282976 3.3377721 3.5780554 1.4944803 -0.7833738 -0.04712505 2.5834422 -0.48632792 +k 2.0435836 4.5311575 -1.0651007 -2.4265084 3.6327755 1.7922076 -0.35113478 0.117293194 -3.5790498 2.4950218 3.6615045 0.14509186 -1.5019038 1.4732664 2.0449061 -0.8877038 +w 3.0149353 4.319504 -0.5854971 -2.041472 3.968808 2.9274228 -0.52555484 -0.53336686 -3.3396184 2.4152582 3.7330258 0.8324638 -1.6112037 0.54619354 1.377292 -1.9356985 diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250.ipynb" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250.ipynb" new file mode 100644 index 0000000000000000000000000000000000000000..71769b3e08ff7723e1475580d9098fea4107006e --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250.ipynb" @@ -0,0 +1,560 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "7df3f10d", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-20T08:51:10.623834Z", + "start_time": "2022-09-20T08:51:08.313735Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\users\\deu\\miniconda3\\envs\\python_3.9\\lib\\site-packages\\scipy\\__init__.py:138: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.3)\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion} is required for this version of \"\n" + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "import pathlib\n", + "import logging\n", + "import pandas as pd\n", + "from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef\n", + "os.chdir(r'D:\\Users\\deu\\Desktop\\latin_macronization\\module')\n", + "from my_tools import *\n", + "from my_dataset import MyDataset\n", + "window = 10\n", + "char_int_mapping, embeddings_matrix = build_char_int_mapping_and_embeddings_matrix(\n", + " 'unmacronized.vector')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f13d1b18", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-20T08:51:12.302865Z", + "start_time": "2022-09-20T08:51:12.297859Z" + } + }, + "outputs": [], + "source": [ + "# 神经网络\n", + "class NN(torch.nn.Module):\n", + " def __init__(self, n_inputs):\n", + " super(NN, self).__init__()\n", + " # 读取词向量\n", + " self.embedding = torch.nn.Embedding.from_pretrained(embeddings_matrix)\n", + " # 读取位置信息\n", + " self.positional_cuda = positional_embedding(window).cuda()\n", + " # 以下各层均使用此初始化函数\n", + " initialize = lambda x: torch.nn.init.kaiming_uniform_(x.weight, nonlinearity='relu')\n", + "\n", + " # 4个dense层\n", + " self.layer_1 = torch.nn.Linear(n_inputs, 128)\n", + " initialize(self.layer_1)\n", + " self.activation_1 = torch.nn.ReLU()\n", + " self.layer_2 = torch.nn.Linear(128, 32)\n", + " initialize(self.layer_2)\n", + " self.activation_2 = torch.nn.ReLU()\n", + " self.layer_3 = torch.nn.Linear(32, 8)\n", + " initialize(self.layer_3)\n", + " self.activation_3 = torch.nn.ReLU()\n", + " self.layer_4 = torch.nn.Linear(8, 1)\n", + " initialize(self.layer_4)\n", + " self.activation_4 = torch.nn.Sigmoid()\n", + "\n", + " self.layers = [\n", + " self.layer_1, self.activation_1,\n", + " self.layer_2, self.activation_2,\n", + " self.layer_3, self.activation_3, self.layer_4, self.activation_4\n", + " ]\n", + "\n", + " def forward(self, X):\n", + " # 嵌入词向量\n", + " X = self.embedding(X)\n", + " # 加上位置信息\n", + " X = X * self.positional_cuda\n", + " # 展平成1维\n", + " X = X.flatten(start_dim=1)\n", + " # 应用每一层\n", + " for layer in self.layers:\n", + " X = layer(X)\n", + " return X" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6a3f6f8b", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-20T08:51:17.720831Z", + "start_time": "2022-09-20T08:51:17.712824Z" + } + }, + "outputs": [], + "source": [ + "# 划分训练集和测试集\n", + "def prepare_data(path):\n", + " trainset, testset = MyDataset(path).get_trainset_and_testset()\n", + " train_dataloader = torch.utils.data.DataLoader(trainset,\n", + " batch_size=32,\n", + " shuffle=True)\n", + " test_dataloader = torch.utils.data.DataLoader(testset,\n", + " batch_size=2048,\n", + " shuffle=False)\n", + " return train_dataloader, test_dataloader\n", + "\n", + "\n", + "# 训练模型\n", + "def train_model(train_dataloader, model):\n", + " calc_loss = torch.nn.BCELoss()\n", + " optim = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)\n", + " for epoch in range(5):\n", + " for i, (X, y) in enumerate(train_dataloader):\n", + " X = X.cuda()\n", + " y = y.cuda()\n", + " optim.zero_grad()\n", + " yhat = model(X)\n", + " loss = calc_loss(yhat, y)\n", + " loss.backward()\n", + " if i % 1000 == 0:\n", + " logging.warning(\n", + " f\"epoch: {epoch}, batch: {i}, loss: {loss.data}\")\n", + " optim.step()\n", + "\n", + "\n", + "# 评估模型\n", + "def evaluate_model(test_dataloader, model):\n", + " preds, actuals = [], []\n", + " for Xs, ys in test_dataloader:\n", + " yhat = model(Xs)\n", + " yhat = yhat.detach().numpy()\n", + " actual = ys.numpy()\n", + " actual = actual.reshape((len(actual), 1))\n", + " yhat = yhat.round()\n", + " preds.append(yhat)\n", + " actuals.append(actual)\n", + " preds, actuals = np.vstack(preds), np.vstack(actuals)\n", + " f1 = f1_score(actuals, preds)\n", + " return f1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59ce7dd6", + "metadata": {}, + "outputs": [], + "source": [ + "# 训练好的模型放置在epoch_5文件夹中\n", + "\n", + "def main(path):\n", + " path = pathlib.Path(path)\n", + " window = 10\n", + " train_dataloader, test_dataloader = prepare_data(path)\n", + " model = NN((2 * window + 1) * embeddings_matrix.shape[1])\n", + " print(model)\n", + "\n", + " # 把模型放到GPU上\n", + " model = model.cuda()\n", + " # 训练模型\n", + " train_model(train_dataloader, model)\n", + "\n", + " # 保存模型并重新读取到CPU上\n", + " save_path = f'{path.stem}.model'\n", + " torch.save(model, save_path)\n", + " model = torch.load(save_path, map_location='cpu')\n", + "\n", + " # 在CPU上验证模型\n", + " f1 = evaluate_model(test_dataloader, model)\n", + " print(f'{path}, f1_score: {f1}')\n", + "\n", + "\n", + "for char in 'yaeiou':\n", + " main(f'{char}.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa6c3927", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-20T07:46:09.953471Z", + "start_time": "2022-09-20T07:37:21.883383Z" + } + }, + "outputs": [], + "source": [ + "# epoch_5文件夹中是我训练了5轮的模型\n", + "\n", + "def choose_threshold(char):\n", + " path = pathlib.Path(f'{char}.txt')\n", + " train_dl, test_dl = prepare_data(path)\n", + " model = torch.load(f'./epoch_5/{char}.model')\n", + " preds, actuals = [], []\n", + " for (Xs, ys) in test_dl:\n", + " yhat = model(Xs.cuda()).cpu().detach().numpy()\n", + " actual = ys.numpy()\n", + " actual = actual.reshape((len(actual), 1))\n", + " preds.append(yhat)\n", + " actuals.append(actual)\n", + " preds, actuals = np.vstack(preds), np.vstack(actuals)\n", + " print(f'{char} macron_ratio: {sum(actuals) / len(actuals)}')\n", + " evaluation = []\n", + " for threshold in np.linspace(0, 1, 100 + 1):\n", + " above_threshold = (preds > threshold).astype('int32')\n", + " result = [\n", + " threshold,\n", + " accuracy_score(actuals, above_threshold),\n", + " f1_score(actuals, above_threshold),\n", + " matthews_corrcoef(actuals, above_threshold)\n", + " ]\n", + " evaluation.append(result)\n", + " evaluation = pd.DataFrame(evaluation,\n", + " columns=['threshold', 'accuracy', 'f1_score', 'matthews_corrcoef'\n", + " ]).sort_values(by=['matthews_corrcoef'],\n", + " axis=0,\n", + " ascending=[False])\n", + " evaluation.to_csv(f'./evaluation/eval_{char}.txt', sep='\\t', index=False)\n", + " return evaluation\n", + "\n", + "for char in 'aeiouy':\n", + " choose_threshold(char)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a22d59e4", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-20T09:09:17.566959Z", + "start_time": "2022-09-20T09:09:17.536942Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "macronized = 'Lēgēs quās Rōmānī fēcērunt tam bonae erant ut nēmō eās violāre vellet.'\n", + "unmacronized = 'Leges quas Romani fecerunt tam bonae erant ut nemo eas violare vellet.'\n", + "remacronized = 'Lēgēs quās Rōmānī fēcērunt tam bonae erant ut nēmō eās violāre vellet.'\n", + "True\n" + ] + } + ], + "source": [ + "import re\n", + "import unicodedata\n", + "\n", + "demacronize = lambda x: ''.join(\n", + " [unicodedata.normalize('NFD', i)[0] for i in x])\n", + "\n", + "# Macronizer能够为一句没有长音的拉丁语句子加上长音\n", + "class Macronizer:\n", + " def __init__(self):\n", + " self.window = 10\n", + " self.macronize = lambda x: unicodedata.normalize('NFC', x + '\\u0304')\n", + " self.pad = lambda x: '#' * self.window + x + '#' * self.window\n", + " self.unpad = lambda x: x[self.window:-self.window]\n", + " # threshold(各元音的softmax最终大于某个阈值时,加上长音)\n", + " # a: 0.56\n", + " # e: 0.43\n", + " # i: 0.46\n", + " # o: 0.59\n", + " # u: 0.52\n", + " # y: 0.19\n", + " self.threshold = {\n", + " 'a': 0.56,\n", + " 'e': 0.43,\n", + " 'i': 0.46,\n", + " 'o': 0.59,\n", + " 'u': 0.52,\n", + " 'y': 0.19\n", + " }\n", + " self.encode = lambda x: [char_int_mapping.get(i, 0) for i in x]\n", + " self.models = {}\n", + " for char in 'aeiouy':\n", + " self.models[char] = torch.load(f'./epoch_5/{char}.model',\n", + " map_location='cpu')\n", + "\n", + " def __call__(self, unmacronized):\n", + " string = self.pad(unmacronized).lower()\n", + " macron_df = []\n", + " for m in re.finditer('[aeiouy]', string):\n", + " center = m.start()\n", + " start = m.start() - window\n", + " end = m.end() + window\n", + " text = string[start:end]\n", + " encoded = self.encode(text)\n", + " macron_df.append([m.group(), center, start, end, text, encoded])\n", + " macron_df = pd.DataFrame(\n", + " macron_df,\n", + " columns=['char', 'center', 'start', 'end', 'text', 'encoded'])\n", + " macron_dfs = {}\n", + " macrons = []\n", + " for char in 'aeiouy':\n", + " if char not in string:\n", + " continue\n", + " macron_dfs[char] = macron_df.query(\"char==@char\").copy()\n", + " macron_dfs[char]['pred'] = self.models[char](torch.tensor(\n", + " macron_dfs[char]['encoded'].tolist())).detach().numpy()\n", + " macron_dfs[char]['macronized'] = macron_dfs[char].apply(\n", + " lambda row: self.macronize(row['char'])\n", + " if row['pred'] > self.threshold[row['char']] else row['char'],\n", + " axis=1)\n", + " macrons.append(macron_dfs[char])\n", + "\n", + " macrons = pd.concat(macrons)\n", + " string_macroned = list(self.pad(unmacronized))\n", + " for row in macrons.itertuples():\n", + " string_macroned[row.center] = row.macronized\n", + " string_macroned = self.unpad(''.join(string_macroned))\n", + " return string_macroned\n", + " \n", + " \n", + "# 测试一句话\n", + "# macronized是已经加好长音的正确的拉丁语句子\n", + "# demacronize的意思是把长音去掉\n", + "# mac再用我们做好的神经网络把长音加上\n", + "mac = Macronizer()\n", + "macronized = 'Lēgēs quās Rōmānī fēcērunt tam bonae erant ut nēmō eās violāre vellet.'\n", + "unmacronized = demacronize(macronized)\n", + "remacronized = mac(unmacronized)\n", + "print(f'{macronized = }')\n", + "print(f'{unmacronized = }')\n", + "print(f'{remacronized = }')\n", + "print(f'{macronized == remacronized}')\n", + "# 完全正确" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "cfd1e159", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-20T09:05:18.531754Z", + "start_time": "2022-09-20T09:05:18.465694Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Celeriter hostes prōgressī sunt ut urbem caperent\n", + "Sed novae cōpiae prōgrediēbantur et illī cōnstituērunt fortiter resistere\n", + "ut hae advenīrent hostēsque vincerent\n", + "sed tandem novae cōpiae advēnērunt et hostes sē recēpēre\n", + "Tanta erat cīvium laetitiā ut in templa sē cōnferrent et dīs multa dōna darent\n" + ] + } + ], + "source": [ + "# 测试一段话\n", + "paragraph = '''Celeriter hostēs prōgressī sunt ut urbem caperent\n", + "Sed novae cōpiae prōgrediēbantur et illī cōnstituērunt fortiter resistere\n", + "ut hae advenīrent hostēsque vincerent\n", + "sed tandem novae cōpiae advēnērunt et hostēs sē recēpēre\n", + "Tanta erat cīvium laetitia ut in templa sē cōnferrent et dīs multa dōna darent'''.splitlines()\n", + "for clause in paragraph:\n", + " print(mac(demacronize(clause)))\n", + "# 仅在laetitia处出错" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e888ba40", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-21T07:25:26.666919Z", + "start_time": "2022-09-21T07:25:26.641896Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
thresholdaccuracyf1_scorematthews_corrcoefchar
00.560.9689930.9489080.926825a
00.430.9904340.9763590.970375e
00.460.9854580.9727760.962858i
00.590.9894690.9900870.978881o
00.520.9928140.9711270.967023u
00.190.9878670.9495110.943089y
\n", + "
" + ], + "text/plain": [ + " threshold accuracy f1_score matthews_corrcoef char\n", + "0 0.56 0.968993 0.948908 0.926825 a\n", + "0 0.43 0.990434 0.976359 0.970375 e\n", + "0 0.46 0.985458 0.972776 0.962858 i\n", + "0 0.59 0.989469 0.990087 0.978881 o\n", + "0 0.52 0.992814 0.971127 0.967023 u\n", + "0 0.19 0.987867 0.949511 0.943089 y" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "os.chdir(r'D:\\Users\\deu\\Desktop\\latin_macronization\\module')\n", + "dfs = []\n", + "for char in 'aeiouy':\n", + " df = pd.read_csv(f'./evaluation/eval_{char}.txt', sep='\\t')\n", + " df['char'] = char\n", + " dfs.append(df)\n", + "dfs = pd.concat(dfs)\n", + "threshold = {'a': 0.56, 'e': 0.43, 'i': 0.46, 'o': 0.59, 'u': 0.52, 'y': 0.19}\n", + "query = ' or '.join([f\"char == '{k}' and threshold == {v}\" for (k,v) in threshold.items()])\n", + "dfs.query(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0c5908c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git "a/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/\350\257\255\346\226\231\346\224\266\351\233\206\345\222\214\350\257\215\345\220\221\351\207\217\350\256\255\347\273\203.ipynb" "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/\350\257\255\346\226\231\346\224\266\351\233\206\345\222\214\350\257\215\345\220\221\351\207\217\350\256\255\347\273\203.ipynb" new file mode 100644 index 0000000000000000000000000000000000000000..bbcf5a479f434b68319075ac8a915d4363566043 --- /dev/null +++ "b/code/2022_autumn/\350\256\270\344\270\200\350\257\272-\346\213\211\344\270\201\350\257\255\351\225\277\351\237\263\350\207\252\345\212\250\346\240\207\346\263\250/\350\257\255\346\226\231\346\224\266\351\233\206\345\222\214\350\257\215\345\220\221\351\207\217\350\256\255\347\273\203.ipynb" @@ -0,0 +1,313 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "982fcb31", + "metadata": {}, + "source": [ + "以下是我提取语料时所用的代码\n", + "对应代码库corpus_sample文件夹的文件\n", + "因为中间经历了很多数据检查步骤,因此下面的代码不是直接可运行的,只能作为做过实验的证据" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e0b8a2f", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-19T00:37:34.983831Z", + "start_time": "2022-09-19T00:37:33.321318Z" + } + }, + "outputs": [], + "source": [ + "import re\n", + "import os\n", + "import pandas as pd\n", + "from glob import glob\n", + "os.chdir(r'G:\\Users\\namentlich\\Desktop\\desktop20160908\\我做的mdx成品\\【72】拉丁图书馆加长音\\newlatin\\www.thelatinlibrary.com')\n", + "files = glob(r'./**/*_plaintext*')\n", + "corpus = []\n", + "for i in files:\n", + " with open(i, encoding='utf8') as f:\n", + " corpus.append([i, re.sub(' ', '\\n', f.read())])\n", + "corpus = pd.DataFrame(corpus, columns=['path', 'text'])\n", + "corpus" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04fa2341", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-19T00:38:04.732894Z", + "start_time": "2022-09-19T00:37:52.621877Z" + } + }, + "outputs": [], + "source": [ + "import re\n", + "import unicodedata\n", + "demacronize = lambda x: ''.join([unicodedata.normalize('NFD', i)[0] for i in x])\n", + "en_words = set(pd.read_csv(r'D:\\Users\\deu\\Desktop\\英语词频\\unigram_freq.csv')['word'].dropna())\n", + "\n", + "def calc_en_ratio(text):\n", + " sample = demacronize(text.lower())\n", + " words = re.findall(r'[a-z]+', sample)\n", + " english_words = [i for i in words if i in en_words]\n", + " return len(english_words) / len(words)\n", + "\n", + "corpus['en_ratio'] = corpus['text'].map(calc_en_ratio)\n", + "corpus" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ed99ba5", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-18T06:13:05.843615Z", + "start_time": "2022-09-18T06:13:00.535287Z" + } + }, + "outputs": [], + "source": [ + "import itertools\n", + "def unnest(nested_list):\n", + " return list(itertools.chain(*nested_list))\n", + "\n", + "def get_sentences(text):\n", + " remove_digit = lambda x: re.sub(r'[\\d]', '', x)\n", + " replace_space = lambda x: re.sub(r' ', '@', x)\n", + " replace_illegal = lambda x: re.sub(r'[^a-zA-ZĀāĒēĪīŌōŪūȲȳ@]', r'\\n', x)\n", + " sentences = [i for i in replace_illegal(replace_space(remove_digit(text))).splitlines() if i]\n", + " # threshold = np.quantile(np.array([len(i) for i in sentences]), 0.1)\n", + " threshold = 10\n", + " sentences = [i for i in sentences if len(i) > threshold]\n", + " return sentences\n", + "\n", + "# corpus['en_ratio'].hist(bins=20)\n", + "latin = corpus.query('en_ratio < 0.65').reset_index(drop=True)\n", + "sentences = unnest(latin['text'].map(get_sentences).tolist())\n", + "sentences = [' '.join(i.lower()) for i in set(list(sentences))]\n", + "with open(r'D:\\Users\\deu\\Desktop\\macron\\latin_corpus.txt', 'w', encoding='utf8') as f:\n", + " f.write('\\n'.join(sentences))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01a34aa9", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-19T00:43:21.505079Z", + "start_time": "2022-09-19T00:43:16.461491Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "os.chdir(r'D:\\Users\\deu\\Desktop\\macron')\n", + "\n", + "char_translate = str.maketrans({\n", + " 'ā': 'a',\n", + " 'ē': 'e',\n", + " 'ī': 'i',\n", + " 'ō': 'o',\n", + " 'ū': 'u',\n", + " 'ȳ': 'y'\n", + " })\n", + "\n", + "with open('latin_corpus 2022091802.txt', encoding='utf8') as f1, open('latin_corpus 2022091901.txt', 'w', encoding='utf8') as f2:\n", + " f2.write(f1.read().translate(char_translate))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b594c8a8", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-19T00:44:39.654588Z", + "start_time": "2022-09-19T00:43:55.669160Z" + } + }, + "outputs": [], + "source": [ + "# 训练词向量\n", + "import os\n", + "import pandas as pd\n", + "import multiprocessing\n", + "os.chdir(r'D:\\Users\\deu\\Desktop\\macron')\n", + "from gensim.models import Word2Vec\n", + "from gensim.models.word2vec import LineSentence\n", + "\n", + "def calc_word_vector(corpus):\n", + " model = Word2Vec(LineSentence(corpus),\n", + " vector_size=16,\n", + " window=5,\n", + " min_count=5,\n", + " workers=multiprocessing.cpu_count(),\n", + " epochs=5)\n", + " model.save('latinchar.model')\n", + " model.wv.save_word2vec_format('unmacronized.vector', binary=False)\n", + " latinchar_vector = pd.read_csv('unmacronized.vector', sep=' ', skiprows=1, header=None)\n", + " return latinchar_vector\n", + "\n", + "latinchar_vector = calc_word_vector('latin_corpus 2022091901.txt')\n", + "latinchar_vector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b59a364", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-19T00:45:18.817215Z", + "start_time": "2022-09-19T00:45:18.537962Z" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "latinchar_vector = pd.read_csv('unmacronized.vector', sep=' ', skiprows=1, header=None)\n", + "latinchar_vector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd7f6004", + "metadata": {}, + "outputs": [], + "source": [ + "epoch=2, 23.9s\n", + "epoch=3, 34.2s\n", + "epoch=4, 42.6s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c688da1", + "metadata": { + "ExecuteTime": { + "end_time": "2022-09-18T06:27:41.118107Z", + "start_time": "2022-09-18T06:27:41.113103Z" + } + }, + "outputs": [], + "source": [ + "# 词向量需要多少维度\n", + "N = 30 # 单词表大小\n", + "8.33 * np.log10(N)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0fa6d13", + "metadata": {}, + "outputs": [], + "source": [ + "# 根据语料库构建数据\n", + "def find_all_vowels(corpus):\n", + " def encode_X(row):\n", + " def encode(string):\n", + " return [char_int_mapping.get(i, 0) for i in string]\n", + " return encode(row['pretext_plain']) + encode(row['vowel_plain']) + encode(row['posttext_plain'])\n", + "\n", + " char_translate = str.maketrans({\n", + " 'ā': 'a',\n", + " 'ē': 'e',\n", + " 'ī': 'i',\n", + " 'ō': 'o',\n", + " 'ū': 'u',\n", + " 'ȳ': 'y'\n", + " })\n", + "\n", + " chars = ''.join(char_int_mapping.keys())\n", + " corpus2 = re.sub(' ', '@', re.sub(fr'[^ {chars}]', '#', corpus.lower()))\n", + " corpus3 = corpus2.translate(char_translate)\n", + "\n", + " corpus4 = []\n", + " for m in re.finditer('[aeiouy]', corpus3):\n", + " corpus4.append([\n", + " m.start(), corpus[m.start() - window:m.start()],\n", + " corpus2[m.start() - window:m.start()],\n", + " corpus3[m.start() - window:m.start()], corpus[m.start():m.end()],\n", + " corpus2[m.start():m.end()], corpus3[m.start():m.end()],\n", + " corpus[m.end():m.end() + window], corpus2[m.end():m.end() + window],\n", + " corpus3[m.end():m.end() + window]\n", + " ])\n", + " corpus4 = pd.DataFrame(corpus4,\n", + " columns=[\n", + " 'position', 'pretext_original', 'pretext_macroned',\n", + " 'pretext_plain', 'vowel_original', 'vowel_macroned',\n", + " 'vowel_plain', 'posttext_original',\n", + " 'posttext_macroned', 'posttext_plain'\n", + " ])\n", + " corpus4['y'] = (corpus4['vowel_macroned'] !=\n", + " corpus4['vowel_plain']).astype(int)\n", + "\n", + "\n", + " corpus4['X'] = corpus4.apply(encode_X, axis=1)\n", + " return corpus4" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}