From 644c2e92d5c3146613eaf8d44611922815dd163b Mon Sep 17 00:00:00 2001
From: 高承君 <403782415@qq.com>
Date: Fri, 12 Jun 2020 12:57:54 +0800
Subject: [PATCH] add python 青春有你2 (Youth With You 2) data analysis: all
 pretty young ladies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 "python 青春有你2 数据分析都是漂亮的小姐姐哦" | 205 ++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 "python 青春有你2 数据分析都是漂亮的小姐姐哦"

diff --git "a/python 青春有你2 数据分析都是漂亮的小姐姐哦" "b/python 青春有你2 数据分析都是漂亮的小姐姐哦"
new file mode 100644
index 0000000..64c018b
--- /dev/null
+++ "b/python 青春有你2 数据分析都是漂亮的小姐姐哦"
@@ -0,0 +1,205 @@
# !pip install jieba
# !pip install wordcloud
# Default font directory on Linux
# !ls /usr/share/fonts/
# List the .ttf Chinese fonts available on the system
!fc-list :lang=zh | grep ".ttf"
# !wget https://mydueros.cdn.bcebos.com/font/simhei.ttf  # download a Chinese font
# # Create a font directory named .fonts
# !mkdir .fonts
# # Copy the font file into it
# !cp simhei.ttf .fonts/
# Install the detection model
# !hub install porn_detection_lstm==1.1.0
# !pip install --upgrade paddlehub
from __future__ import print_function
import requests
import json
import re      # regular expressions
import time    # timing utilities
import jieba   # Chinese word segmentation
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from PIL import Image
from wordcloud import WordCloud   # word-cloud rendering
import paddlehub as hub

print(matplotlib.matplotlib_fname())

# Request the iQiyi comment API and return the response
def getMovieinfo(url):
    '''
    Request the iQiyi comment API and return the response body.
    Argument url: URL of the comment API
    :return: response text, or None on failure
    '''
    session = requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/json",
        "Referer": "https://www.iqiyi.com/v_19ryi480ks.html",
        "Origin": "https://www.iqiyi.com/",
        "Host": "sns-comment.iqiyi.com",
        "Connection": "keep-alive",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Accept-Encoding": "gzip,deflate,br"
    }
    response = session.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None

# Parse the JSON payload and collect the comments
def saveMovieInfoToFile(lastId, arr):
    '''
    Parse the JSON payload and collect the comments.
    Arguments lastId: id of the last comment fetched; arr: list collecting the texts
    :return: the new lastId for the next page
    '''
    # Implicit string concatenation keeps the long query string free of the
    # stray whitespace a backslash continuation would bake into the URL.
    url = ("https://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118"
           "&agent_version=9.11.5&authcookie=null&business_type=17"
           "&content_id=15535228800&hot_size=0&last_id=")
    url += str(lastId)
    responseTxt = getMovieinfo(url)
    responseJson = json.loads(responseTxt)
    comments = responseJson['data']['comments']
    for val in comments:
        if 'content' in val.keys():
            arr.append(val['content'])
        lastId = str(val['id'])
    return lastId
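# A minimal offline sketch (never called) of the last_id paging contract the
# two functions above rely on. The JSON shape is inferred from the parsing
# code in saveMovieInfoToFile, not from any iQiyi documentation, and
# sample_page is made-up data.
def _paging_demo():
    sample_page = {"data": {"comments": [
        {"id": 241, "content": "小姐姐好美"},
        {"id": 242, "content": "冲鸭"},
    ]}}
    arr = []
    lastId = "0"
    for val in sample_page["data"]["comments"]:
        if "content" in val.keys():
            arr.append(val["content"])
        lastId = str(val["id"])
    # The id of the last comment ("242" here) becomes the last_id= parameter
    # of the next request, which is how the loop in __main__ pages forward.
    return lastId, arr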
# Strip special characters from the text
def clear_special_char(content):
    '''
    Remove special characters with a regular expression.
    Argument content: original text
    return: cleaned text
    '''
    # Keep only Latin letters, digits and CJK characters
    comp = re.compile('[^A-Za-z0-9\u4e00-\u9fa5]')
    return comp.sub('', content)


def fenci(text):
    '''
    Tokenize with jieba.
    Argument text: the sentence or text to tokenize
    return: list of tokens
    '''
    # Load the local user dictionary
    jieba.load_userdict('work/stopword.txt')
    seg = jieba.lcut(text, cut_all=False)
    return seg


def stopwordslist(file_path):
    '''
    Build the stopword list.
    Argument file_path: path of the stopword file
    return: list of stopwords
    '''
    with open(file_path, encoding='UTF-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords


def movestopwords(sentence, stopwords, counts):
    '''
    Drop stopwords and tally word frequencies.
    Arguments sentence: token list; stopwords: stopword list; counts: frequency dict
    return: None (counts is updated in place)
    '''
    for word in sentence:
        # Skip stopwords and single-character tokens
        if word not in stopwords and len(word) != 1:
            counts[word] = counts.get(word, 0) + 1
    return None


def drawcounts(counts, num):
    '''
    Plot a bar chart of the top-N word frequencies.
    Arguments counts: frequency dict; num: how many words to plot
    return: None
    '''
    x_axis = []
    y_axis = []
    c_order = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    for c in c_order[:num]:
        x_axis.append(c[0])
        y_axis.append(c[1])
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']   # needed to render the Chinese labels
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.bar(x_axis, y_axis)
    plt.title('词频统计结果')
    plt.show()


def drawcloud(word_f):
    '''
    Render a word cloud from the frequency dict and save it to pic.png.
    '''
    cloud_mask = np.array(Image.open('xiaomao.png'))
    st = set(["东西", "这是", "啊啊", "哦", "色色"])   # extra stopwords for the cloud
    wc = WordCloud(background_color='white',
                   mask=cloud_mask,
                   max_words=150,
                   font_path="simhei.ttf",
                   min_font_size=10,
                   max_font_size=100,
                   width=400,
                   relative_scaling=0.3,
                   stopwords=st)
    wc.fit_words(word_f)   # fit_words takes a word -> frequency dict
    wc.to_file('pic.png')


def text_detection(text, file_path):
    '''
    Run PaddleHub porn detection over the comments.
    return: None (prints the flagged comments and their probabilities)
    '''
    porn_detection_lstm = hub.Module(name="porn_detection_lstm")
    f = open(file_path, 'r', encoding='utf-8')
    for line in f:
        if len(line.strip()) == 1:   # skip near-empty lines
            continue
        else:
            text.append(line)
    f.close()
    input_dict = {"text": text}
    results = porn_detection_lstm.detection(data=input_dict, use_gpu=False,
                                            batch_size=1)   # use_gpu=True only with a GPU build of Paddle
    for index, item in enumerate(results):
        if item['porn_detection_key'] == 'porn':
            print(item['text'], ':', item['porn_probs'])


# Comments are paginated, so the iQiyi API has to be requested repeatedly to
# collect every page, and some comments contain emoji and other special
# characters. num is the number of pages; each page holds 10 comments, so
# num=100 fetches roughly 1000 comments.
if __name__ == "__main__":
    num = 100
    lastId = '0'
    arr = []
    with open('aqy.txt', 'a', encoding='utf-8') as f:
        for i in range(num):
            lastId = saveMovieInfoToFile(lastId, arr)
            time.sleep(0.5)   # throttle the requests
        for item in arr:
            Item = clear_special_char(item)
            if Item.strip() != '':
                try:
                    f.write(Item + '\n')
                except Exception as e:
                    print("comment contains unwritable special characters")
    print('Total comments fetched:', len(arr))
    f = open('aqy.txt', 'r', encoding='utf-8')
    counts = {}
    stopwords = stopwordslist('work/stopword.txt')   # load once, not per line
    for line in f:
        words = fenci(line)
        movestopwords(words, stopwords, counts)
    f.close()
    drawcounts(counts, 10)
    drawcloud(counts)
    file_path = 'aqy.txt'
    test_text = []
    text_detection(test_text, file_path)
-- 
Gitee
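A minimal offline check of the tallying logic in the patch, for reference.
This is a sketch under assumptions: it uses jieba's default dictionary, an
inline stopword list standing in for work/stopword.txt, and made-up input
text, so the exact segmentation may vary with the jieba version.

    import jieba

    counts = {}
    stopwords = ["的", "真"]   # stand-in for work/stopword.txt
    for word in jieba.lcut("漂亮的小姐姐真漂亮", cut_all=False):
        # Same rule as movestopwords: drop stopwords and single-char tokens
        if word not in stopwords and len(word) != 1:
            counts[word] = counts.get(word, 0) + 1
    print(counts)   # e.g. {'漂亮': 2, '小姐姐': 1}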