From fe077c266002b41ca6f2374a08bd1c19c78a3019 Mon Sep 17 00:00:00 2001 From: witt Date: Thu, 5 Mar 2020 05:19:55 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E7=99=BE=E5=BA=A6=E5=BC=80=E5=8F=91?= =?UTF-8?q?=E8=80=85=E5=88=86=E9=A1=B5=E7=88=AC=E5=8F=96=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/spider/BlogPageProcessor.java | 116 +++++++++++++++++++----------- 1 file changed, 74 insertions(+), 42 deletions(-) diff --git a/src/spider/BlogPageProcessor.java b/src/spider/BlogPageProcessor.java index 99802be..0af6811 100644 --- a/src/spider/BlogPageProcessor.java +++ b/src/spider/BlogPageProcessor.java @@ -21,18 +21,18 @@ import us.codecraft.webmagic.processor.PageProcessor; * @date 20140124 */ public class BlogPageProcessor implements PageProcessor{ - + public class LinkXpath{ public String linksXpath; //链接列表过滤表达式 public String titlesXpath; //title列表过滤表达式 } - + public class ArticleXpath{ public String contentXpath; //内容过滤表达式 public String titleXpath; //title过滤表达式 public String tagsXpath; //tags过滤表达式 } - + private Site site = new Site(); private String url; private String blogFlag; //博客url的内容标志域 @@ -42,18 +42,18 @@ public class BlogPageProcessor implements PageProcessor{ private List articleXpaths; //获取文件表达式 private List PagelinksRex; //类别页列表过滤表达式 private Hashtable codeHashtable; //代码class映射关系 - private SpiderConfigTool spiderConfig; + private SpiderConfigTool spiderConfig; + - private String domain;//当前域名 public BlogPageProcessor(String url) throws Exception{ if(url.endsWith("/")){ url = url.substring(0, url.length()-1); } this.url=url; - + String spiderName=""; //切割域名 :类似:csdn.net, 51cto.com, cnblogs.com, iteye.com - + //Pattern p=Pattern.compile("\\.([a-zA-Z0-9]+\\.[a-zA-Z]+)"); Pattern p=Pattern.compile("((?!://)([a-zA-Z0-9-_]+\\.)*[a-zA-Z0-9][a-zA-Z0-9-_]+\\.[a-zA-Z]{2,11})"); Matcher m=p.matcher(url); @@ -63,16 +63,16 @@ public class BlogPageProcessor implements PageProcessor{ } else { throw new Exception("不支持的网站!"); } - + spiderConfig = new SpiderConfigTool(spiderName); - + if(spiderConfig == null){ throw new Exception("不支持的网站!"); } - + init(); } - + /** * 初始化 */ @@ -83,16 +83,16 @@ public class BlogPageProcessor implements PageProcessor{ String charset = spiderConfig.getSpiderNode().selectSingleNode("charset").getText(); site.setCharset(charset); site.setSleepTime(1); - + blogFlag = spiderConfig.getSpiderNode().selectSingleNode("blog-flag").getText(); - + initPageRex(); initCodeRex(); initLinkXpath(); initArticleXpath(); initCodeHash(); } - + /** * 初始化 代码替换正则 */ @@ -100,18 +100,18 @@ public class BlogPageProcessor implements PageProcessor{ private void initCodeRex(){ codeBeginRex = new ArrayList(); //代码过滤正则表达式 codeEndRex = new ArrayList(); //代码过滤正则表达式 - + List list = spiderConfig.getSpiderNode().selectNodes("code-begin-rex"); for(Node n:list){ codeBeginRex.add(n.getText()); } - + list = spiderConfig.getSpiderNode().selectNodes("code-end-rex"); for(Node n:list){ codeEndRex.add(n.getText()); } } - + /** * 初始化 分页链接 */ @@ -127,14 +127,14 @@ public class BlogPageProcessor implements PageProcessor{ PagelinksRex.add(temString); } } - + /** * 初始化 获取链接列表xpath */ @SuppressWarnings("unchecked") private void initLinkXpath(){ linkXpaths = new ArrayList(); //获取链接表达式 - + List list = spiderConfig.getSpiderNode().selectNodes("link-xpath"); for(Node node : list){ String link = node.selectSingleNode("links-xpath").getText(); @@ -145,14 +145,14 @@ public class BlogPageProcessor implements PageProcessor{ linkXpaths.add(linkXpath); } } - + /** * 初始化 文章规则 */ @SuppressWarnings("unchecked") private void initArticleXpath(){ articleXpaths = new ArrayList(); //获取文件表达式 - + List list = spiderConfig.getSpiderNode().selectNodes("article-xpath"); for(Node node : list){ String content = node.selectSingleNode("content-xpath").getText(); @@ -162,18 +162,18 @@ public class BlogPageProcessor implements PageProcessor{ articleXpath.contentXpath=content; articleXpath.titleXpath=title; articleXpath.tagsXpath = tags; - + articleXpaths.add(articleXpath); } } - + /** * 初始化代码类型映射 */ @SuppressWarnings("unchecked") private void initCodeHash(){ codeHashtable = new Hashtable(); - + List list = spiderConfig.getSpiderNode().selectNodes("code-hashtable"); for(Node node : list){ String key = node.selectSingleNode("key").getText(); @@ -187,11 +187,11 @@ public class BlogPageProcessor implements PageProcessor{ */ @Override public void process(Page page) { - + Pattern p=Pattern.compile(blogFlag); Matcher m=p.matcher(url); boolean result=m.find(); - + if(result){ getPage(page); page.putField("getlinks", false); @@ -200,7 +200,7 @@ public class BlogPageProcessor implements PageProcessor{ page.putField("getlinks", true); } } - + /** * 抓取链接列表 * @param page @@ -208,20 +208,20 @@ public class BlogPageProcessor implements PageProcessor{ private void getLinks(Page page) { List links = page.getHtml().xpath(linkXpaths.get(0).linksXpath).all(); List titles = page.getHtml().xpath(linkXpaths.get(0).titlesXpath).all(); - + for(int i=1; i < linkXpaths.size() && titles.size() == 0; ++i){ links = page.getHtml().xpath(linkXpaths.get(i).linksXpath).all(); titles = page.getHtml().xpath(linkXpaths.get(i).titlesXpath).all(); } - + page.putField("titles", titles); page.putField("links", links); List Pagelinks = page.getHtml().links().regex(PagelinksRex.get(0)).all(); - + for(int i=1; i < PagelinksRex.size() && Pagelinks.size() == 0; ++i){ Pagelinks = page.getHtml().links().regex(PagelinksRex.get(i)).all(); } - + if(this.domain.equals("www.jianshu.com")){//关于简书博客爬取的处理方式 String total = page.getHtml().xpath("//div[@class='info']/ul/li[3]/div[@class='meta-block']/a/p/text()").toString(); int totalArticle = Integer.valueOf(total); @@ -232,11 +232,43 @@ public class BlogPageProcessor implements PageProcessor{ String url = PagelinksRex.get(0).replace("\\", ""); page.addTargetRequest(url+ i); } - - }else{ - page.addTargetRequests(Pagelinks); - } - + + } else if (this.domain.equals("developer.baidu.com")) { // 分页爬取逻辑 + // 从 "Ta的文章(126)" 中获取博客总数,计算出分页次数 + String blogTotalInfo = page.getHtml().xpath("//div[@class='uc-left-head-list]/a[1]/div[1]/text()").toString(); + Pattern pattern = Pattern.compile("(?<=()\\d+(? Date: Thu, 5 Mar 2020 11:52:00 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=88=86=E9=A1=B5?= =?UTF-8?q?=E6=8A=93=E5=8F=96=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/common/Spider.xml | 2 +- src/spider/BlogPageProcessor.java | 20 ++++++-------------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/common/Spider.xml b/src/common/Spider.xml index ca98fcc..7365b86 100644 --- a/src/common/Spider.xml +++ b/src/common/Spider.xml @@ -267,7 +267,7 @@ /topic/show/ - + diff --git a/src/spider/BlogPageProcessor.java b/src/spider/BlogPageProcessor.java index 0af6811..1db07c6 100644 --- a/src/spider/BlogPageProcessor.java +++ b/src/spider/BlogPageProcessor.java @@ -224,18 +224,18 @@ public class BlogPageProcessor implements PageProcessor{ if(this.domain.equals("www.jianshu.com")){//关于简书博客爬取的处理方式 String total = page.getHtml().xpath("//div[@class='info']/ul/li[3]/div[@class='meta-block']/a/p/text()").toString(); - int totalArticle = Integer.valueOf(total); + int totalArticle = Integer.parseInt(total); int mod = totalArticle%9; int p = totalArticle/9; int totalPage = (mod>0)?p+1:p; - for (int i = 1; i