From 644c2e92d5c3146613eaf8d44611922815dd163b Mon Sep 17 00:00:00 2001
From: 高承君 <403782415@qq.com>
Date: Fri, 12 Jun 2020 12:57:54 +0800
Subject: [PATCH] add python 青春有你2 (Youth With You 2) data analysis: all
 pretty young ladies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 "python 青春有你2 数据分析都是漂亮的小姐姐哦" | 205 ++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 "python 青春有你2 数据分析都是漂亮的小姐姐哦"

diff --git "a/python 青春有你2 数据分析都是漂亮的小姐姐哦" "b/python 青春有你2 数据分析都是漂亮的小姐姐哦"
new file mode 100644
index 0000000..64c018b
--- /dev/null
+++ "b/python 青春有你2 数据分析都是漂亮的小姐姐哦"
@@ -0,0 +1,205 @@
# !pip install jieba
# !pip install wordcloud
# Default font directory on Linux
# !ls /usr/share/fonts/
# List the .ttf Chinese fonts available on the system
!fc-list :lang=zh | grep ".ttf"
# !wget https://mydueros.cdn.bcebos.com/font/simhei.ttf  # download a Chinese font
# # Create a font directory named .fonts
# !mkdir .fonts
# # Copy the font file into it
# !cp simhei.ttf .fonts/
# Install the detection model
# !hub install porn_detection_lstm==1.1.0
# !pip install --upgrade paddlehub
from __future__ import print_function
import requests
import json
import re      # regular expressions
import time    # timing utilities
import jieba   # Chinese word segmentation
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from PIL import Image
from wordcloud import WordCloud   # word-cloud rendering
import paddlehub as hub

print(matplotlib.matplotlib_fname())

# Request the iQiyi comment API and return the response
def getMovieinfo(url):
    '''
    Request the iQiyi comment API and return the response body.
    Argument url: URL of the comment API
    :return: response text, or None on failure
    '''
    session = requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/json",
        "Referer": "https://www.iqiyi.com/v_19ryi480ks.html",
        "Origin": "https://www.iqiyi.com/",
        "Host": "sns-comment.iqiyi.com",
        "Connection": "keep-alive",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Accept-Encoding": "gzip,deflate,br"
    }
    response = session.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None

# Parse the JSON payload and collect the comments
def saveMovieInfoToFile(lastId, arr):
    '''
    Parse the JSON payload and collect the comments.
    Arguments lastId: id of the last comment fetched; arr: list collecting the texts
    :return: the new lastId for the next page
    '''
    # Implicit string concatenation keeps the long query string free of the
    # stray whitespace a backslash continuation would bake into the URL.
    url = ("https://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118"
           "&agent_version=9.11.5&authcookie=null&business_type=17"
           "&content_id=15535228800&hot_size=0&last_id=")
    url += str(lastId)
    responseTxt = getMovieinfo(url)
    responseJson = json.loads(responseTxt)
    comments = responseJson['data']['comments']
    for val in comments:
        if 'content' in val.keys():
            arr.append(val['content'])
        lastId = str(val['id'])
    return lastId
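# A minimal offline sketch (never called) of the last_id paging contract the
# two functions above rely on. The JSON shape is inferred from the parsing
# code in saveMovieInfoToFile, not from any iQiyi documentation, and
# sample_page is made-up data.
def _paging_demo():
    sample_page = {"data": {"comments": [
        {"id": 241, "content": "小姐姐好美"},
        {"id": 242, "content": "冲鸭"},
    ]}}
    arr = []
    lastId = "0"
    for val in sample_page["data"]["comments"]:
        if "content" in val.keys():
            arr.append(val["content"])
        lastId = str(val["id"])
    # The id of the last comment ("242" here) becomes the last_id= parameter
    # of the next request, which is how the loop in __main__ pages forward.
    return lastId, arr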
# Strip special characters from the text
def clear_special_char(content):
    '''
    Remove special characters with a regular expression.
    Argument content: original text
    return: cleaned text
    '''
    # Keep only Latin letters, digits and CJK characters
    comp = re.compile('[^A-Za-z0-9\u4e00-\u9fa5]')
    return comp.sub('', content)


def fenci(text):
    '''
    Tokenize with jieba.
    Argument text: the sentence or text to tokenize
    return: list of tokens
    '''
    # Load the local user dictionary
    jieba.load_userdict('work/stopword.txt')
    seg = jieba.lcut(text, cut_all=False)
    return seg


def stopwordslist(file_path):
    '''
    Build the stopword list.
    Argument file_path: path of the stopword file
    return: list of stopwords
    '''
    with open(file_path, encoding='UTF-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords


def movestopwords(sentence, stopwords, counts):
    '''
    Drop stopwords and tally word frequencies.
    Arguments sentence: token list; stopwords: stopword list; counts: frequency dict
    return: None (counts is updated in place)
    '''
    for word in sentence:
        # Skip stopwords and single-character tokens
        if word not in stopwords and len(word) != 1:
            counts[word] = counts.get(word, 0) + 1
    return None


def drawcounts(counts, num):
    '''
    Plot a bar chart of the top-N word frequencies.
    Arguments counts: frequency dict; num: how many words to plot
    return: None
    '''
    x_axis = []
    y_axis = []
    c_order = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    for c in c_order[:num]:
        x_axis.append(c[0])
        y_axis.append(c[1])
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']   # needed to render the Chinese labels
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.bar(x_axis, y_axis)
    plt.title('词频统计结果')
    plt.show()


def drawcloud(word_f):
    '''
    Render a word cloud from the frequency dict and save it to pic.png.
    '''
    cloud_mask = np.array(Image.open('xiaomao.png'))
    st = set(["东西", "这是", "啊啊", "哦", "色色"])   # extra stopwords for the cloud
    wc = WordCloud(background_color='white',
                   mask=cloud_mask,
                   max_words=150,
                   font_path="simhei.ttf",
                   min_font_size=10,
                   max_font_size=100,
                   width=400,
                   relative_scaling=0.3,
                   stopwords=st)
    wc.fit_words(word_f)   # fit_words takes a word -> frequency dict
    wc.to_file('pic.png')


def text_detection(text, file_path):
    '''
    Run PaddleHub porn detection over the comments.
    return: None (prints the flagged comments and their probabilities)
    '''
    porn_detection_lstm = hub.Module(name="porn_detection_lstm")
    f = open(file_path, 'r', encoding='utf-8')
    for line in f:
        if len(line.strip()) == 1:   # skip near-empty lines
            continue
        else:
            text.append(line)
    f.close()
    input_dict = {"text": text}
    results = porn_detection_lstm.detection(data=input_dict, use_gpu=False,
                                            batch_size=1)   # use_gpu=True only with a GPU build of Paddle
    for index, item in enumerate(results):
        if item['porn_detection_key'] == 'porn':
            print(item['text'], ':', item['porn_probs'])


# Comments are paginated, so the iQiyi API has to be requested repeatedly to
# collect every page, and some comments contain emoji and other special
# characters. num is the number of pages; each page holds 10 comments, so
# num=100 fetches roughly 1000 comments.
if __name__ == "__main__":
    num = 100
    lastId = '0'
    arr = []
    with open('aqy.txt', 'a', encoding='utf-8') as f:
        for i in range(num):
            lastId = saveMovieInfoToFile(lastId, arr)
            time.sleep(0.5)   # throttle the requests
        for item in arr:
            Item = clear_special_char(item)
            if Item.strip() != '':
                try:
                    f.write(Item + '\n')
                except Exception as e:
                    print("comment contains unwritable special characters")
    print('Total comments fetched:', len(arr))
    f = open('aqy.txt', 'r', encoding='utf-8')
    counts = {}
    stopwords = stopwordslist('work/stopword.txt')   # load once, not per line
    for line in f:
        words = fenci(line)
        movestopwords(words, stopwords, counts)
    f.close()
    drawcounts(counts, 10)
    drawcloud(counts)
    file_path = 'aqy.txt'
    test_text = []
    text_detection(test_text, file_path)
-- 
Gitee
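A minimal offline check of the tallying logic in the patch, for reference.
This is a sketch under assumptions: it uses jieba's default dictionary, an
inline stopword list standing in for work/stopword.txt, and made-up input
text, so the exact segmentation may vary with the jieba version.

    import jieba

    counts = {}
    stopwords = ["的", "真"]   # stand-in for work/stopword.txt
    for word in jieba.lcut("漂亮的小姐姐真漂亮", cut_all=False):
        # Same rule as movestopwords: drop stopwords and single-char tokens
        if word not in stopwords and len(word) != 1:
            counts[word] = counts.get(word, 0) + 1
    print(counts)   # e.g. {'漂亮': 2, '小姐姐': 1}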