diff --git a/src/common/Spider.xml b/src/common/Spider.xml index ca98fcc4b6258de17c12c4de3896a139888ad3b4..7365b86b4c33479c0545a34dde3025bea7dfe41f 100644 --- a/src/common/Spider.xml +++ b/src/common/Spider.xml @@ -267,7 +267,7 @@ /topic/show/ - + diff --git a/src/spider/BlogPageProcessor.java b/src/spider/BlogPageProcessor.java index 99802be3d426cd19b3a443cb02456248307183a1..1db07c65881d42b17670a5575785102ba2525caa 100644 --- a/src/spider/BlogPageProcessor.java +++ b/src/spider/BlogPageProcessor.java @@ -21,18 +21,18 @@ import us.codecraft.webmagic.processor.PageProcessor; * @date 20140124 */ public class BlogPageProcessor implements PageProcessor{ - + public class LinkXpath{ public String linksXpath; //链接列表过滤表达式 public String titlesXpath; //title列表过滤表达式 } - + public class ArticleXpath{ public String contentXpath; //内容过滤表达式 public String titleXpath; //title过滤表达式 public String tagsXpath; //tags过滤表达式 } - + private Site site = new Site(); private String url; private String blogFlag; //博客url的内容标志域 @@ -42,18 +42,18 @@ public class BlogPageProcessor implements PageProcessor{ private List articleXpaths; //获取文件表达式 private List PagelinksRex; //类别页列表过滤表达式 private Hashtable codeHashtable; //代码class映射关系 - private SpiderConfigTool spiderConfig; + private SpiderConfigTool spiderConfig; + - private String domain;//当前域名 public BlogPageProcessor(String url) throws Exception{ if(url.endsWith("/")){ url = url.substring(0, url.length()-1); } this.url=url; - + String spiderName=""; //切割域名 :类似:csdn.net, 51cto.com, cnblogs.com, iteye.com - + //Pattern p=Pattern.compile("\\.([a-zA-Z0-9]+\\.[a-zA-Z]+)"); Pattern p=Pattern.compile("((?!://)([a-zA-Z0-9-_]+\\.)*[a-zA-Z0-9][a-zA-Z0-9-_]+\\.[a-zA-Z]{2,11})"); Matcher m=p.matcher(url); @@ -63,16 +63,16 @@ public class BlogPageProcessor implements PageProcessor{ } else { throw new Exception("不支持的网站!"); } - + spiderConfig = new SpiderConfigTool(spiderName); - + if(spiderConfig == null){ throw new Exception("不支持的网站!"); } - + init(); } - + /** * 初始化 */ @@ -83,16 +83,16 @@ public class BlogPageProcessor implements PageProcessor{ String charset = spiderConfig.getSpiderNode().selectSingleNode("charset").getText(); site.setCharset(charset); site.setSleepTime(1); - + blogFlag = spiderConfig.getSpiderNode().selectSingleNode("blog-flag").getText(); - + initPageRex(); initCodeRex(); initLinkXpath(); initArticleXpath(); initCodeHash(); } - + /** * 初始化 代码替换正则 */ @@ -100,18 +100,18 @@ public class BlogPageProcessor implements PageProcessor{ private void initCodeRex(){ codeBeginRex = new ArrayList(); //代码过滤正则表达式 codeEndRex = new ArrayList(); //代码过滤正则表达式 - + List list = spiderConfig.getSpiderNode().selectNodes("code-begin-rex"); for(Node n:list){ codeBeginRex.add(n.getText()); } - + list = spiderConfig.getSpiderNode().selectNodes("code-end-rex"); for(Node n:list){ codeEndRex.add(n.getText()); } } - + /** * 初始化 分页链接 */ @@ -127,14 +127,14 @@ public class BlogPageProcessor implements PageProcessor{ PagelinksRex.add(temString); } } - + /** * 初始化 获取链接列表xpath */ @SuppressWarnings("unchecked") private void initLinkXpath(){ linkXpaths = new ArrayList(); //获取链接表达式 - + List list = spiderConfig.getSpiderNode().selectNodes("link-xpath"); for(Node node : list){ String link = node.selectSingleNode("links-xpath").getText(); @@ -145,14 +145,14 @@ public class BlogPageProcessor implements PageProcessor{ linkXpaths.add(linkXpath); } } - + /** * 初始化 文章规则 */ @SuppressWarnings("unchecked") private void initArticleXpath(){ articleXpaths = new ArrayList(); //获取文件表达式 - + List list = spiderConfig.getSpiderNode().selectNodes("article-xpath"); for(Node node : list){ String content = node.selectSingleNode("content-xpath").getText(); @@ -162,18 +162,18 @@ public class BlogPageProcessor implements PageProcessor{ articleXpath.contentXpath=content; articleXpath.titleXpath=title; articleXpath.tagsXpath = tags; - + articleXpaths.add(articleXpath); } } - + /** * 初始化代码类型映射 */ @SuppressWarnings("unchecked") private void initCodeHash(){ codeHashtable = new Hashtable(); - + List list = spiderConfig.getSpiderNode().selectNodes("code-hashtable"); for(Node node : list){ String key = node.selectSingleNode("key").getText(); @@ -187,11 +187,11 @@ public class BlogPageProcessor implements PageProcessor{ */ @Override public void process(Page page) { - + Pattern p=Pattern.compile(blogFlag); Matcher m=p.matcher(url); boolean result=m.find(); - + if(result){ getPage(page); page.putField("getlinks", false); @@ -200,7 +200,7 @@ public class BlogPageProcessor implements PageProcessor{ page.putField("getlinks", true); } } - + /** * 抓取链接列表 * @param page @@ -208,35 +208,59 @@ public class BlogPageProcessor implements PageProcessor{ private void getLinks(Page page) { List links = page.getHtml().xpath(linkXpaths.get(0).linksXpath).all(); List titles = page.getHtml().xpath(linkXpaths.get(0).titlesXpath).all(); - + for(int i=1; i < linkXpaths.size() && titles.size() == 0; ++i){ links = page.getHtml().xpath(linkXpaths.get(i).linksXpath).all(); titles = page.getHtml().xpath(linkXpaths.get(i).titlesXpath).all(); } - + page.putField("titles", titles); page.putField("links", links); List Pagelinks = page.getHtml().links().regex(PagelinksRex.get(0)).all(); - + for(int i=1; i < PagelinksRex.size() && Pagelinks.size() == 0; ++i){ Pagelinks = page.getHtml().links().regex(PagelinksRex.get(i)).all(); } - + if(this.domain.equals("www.jianshu.com")){//关于简书博客爬取的处理方式 String total = page.getHtml().xpath("//div[@class='info']/ul/li[3]/div[@class='meta-block']/a/p/text()").toString(); - int totalArticle = Integer.valueOf(total); + int totalArticle = Integer.parseInt(total); int mod = totalArticle%9; int p = totalArticle/9; int totalPage = (mod>0)?p+1:p; - for (int i = 1; i