diff --git a/src/common/Spider.xml b/src/common/Spider.xml
index ca98fcc4b6258de17c12c4de3896a139888ad3b4..7365b86b4c33479c0545a34dde3025bea7dfe41f 100644
--- a/src/common/Spider.xml
+++ b/src/common/Spider.xml
@@ -267,7 +267,7 @@
/topic/show/
-
+
diff --git a/src/spider/BlogPageProcessor.java b/src/spider/BlogPageProcessor.java
index 99802be3d426cd19b3a443cb02456248307183a1..1db07c65881d42b17670a5575785102ba2525caa 100644
--- a/src/spider/BlogPageProcessor.java
+++ b/src/spider/BlogPageProcessor.java
@@ -21,18 +21,18 @@ import us.codecraft.webmagic.processor.PageProcessor;
* @date 20140124
*/
public class BlogPageProcessor implements PageProcessor{
-
+
// Pair of XPath expressions applied to a list page: one selects the article
// links, the other selects the matching titles. Instances are collected in
// linkXpaths by initLinkXpath() and read by getLinks().
public class LinkXpath{
public String linksXpath; // XPath expression that filters the list of links
public String titlesXpath; // XPath expression that filters the list of titles
}
-
+
// Set of XPath expressions describing a single article page. Instances are
// filled from the "article-xpath" configuration nodes by initArticleXpath()
// and stored in articleXpaths.
public class ArticleXpath{
public String contentXpath; // XPath expression that filters the article content
public String titleXpath; // XPath expression that filters the article title
public String tagsXpath; // XPath expression that filters the article tags
}
-
+
private Site site = new Site(); // crawl settings; charset and sleep time are applied during initialisation
private String url; // target URL passed to the constructor, stored without a trailing "/"
private String blogFlag; // "blog-flag" config value; process() compiles it as a regex and matches it against url
@@ -42,18 +42,18 @@ public class BlogPageProcessor implements PageProcessor{
private List articleXpaths; // expressions used to extract an article (ArticleXpath entries)
private List PagelinksRex; // regexes that filter the pagination links of a category/list page
private Hashtable codeHashtable; // mapping between code CSS classes (key -> value from "code-hashtable" nodes)
- private SpiderConfigTool spiderConfig;
+ private SpiderConfigTool spiderConfig;
+
-
private String domain;// current domain name
public BlogPageProcessor(String url) throws Exception{
if(url.endsWith("/")){
url = url.substring(0, url.length()-1);
}
this.url=url;
-
+
String spiderName=""; //切割域名 :类似:csdn.net, 51cto.com, cnblogs.com, iteye.com
-
+
//Pattern p=Pattern.compile("\\.([a-zA-Z0-9]+\\.[a-zA-Z]+)");
Pattern p=Pattern.compile("((?!://)([a-zA-Z0-9-_]+\\.)*[a-zA-Z0-9][a-zA-Z0-9-_]+\\.[a-zA-Z]{2,11})");
Matcher m=p.matcher(url);
@@ -63,16 +63,16 @@ public class BlogPageProcessor implements PageProcessor{
} else {
throw new Exception("不支持的网站!");
}
-
+
spiderConfig = new SpiderConfigTool(spiderName);
-
+
if(spiderConfig == null){
throw new Exception("不支持的网站!");
}
-
+
init();
}
-
+
/**
* 初始化
*/
@@ -83,16 +83,16 @@ public class BlogPageProcessor implements PageProcessor{
String charset = spiderConfig.getSpiderNode().selectSingleNode("charset").getText();
site.setCharset(charset);
site.setSleepTime(1);
-
+
blogFlag = spiderConfig.getSpiderNode().selectSingleNode("blog-flag").getText();
-
+
initPageRex();
initCodeRex();
initLinkXpath();
initArticleXpath();
initCodeHash();
}
-
+
/**
* 初始化 代码替换正则
*/
@@ -100,18 +100,18 @@ public class BlogPageProcessor implements PageProcessor{
private void initCodeRex(){
// Rebuilds both code-replacement regex lists from the spider configuration:
// the text of every "code-begin-rex" node goes into codeBeginRex and the
// text of every "code-end-rex" node goes into codeEndRex.
codeBeginRex = new ArrayList(); // regexes matching the start of a code section
codeEndRex = new ArrayList(); // regexes matching the end of a code section
-
+
// Collect the begin patterns in the order selectNodes returns them.
List list = spiderConfig.getSpiderNode().selectNodes("code-begin-rex");
for(Node n:list){
codeBeginRex.add(n.getText());
}
-
+
// Reuse the same local for the end patterns.
list = spiderConfig.getSpiderNode().selectNodes("code-end-rex");
for(Node n:list){
codeEndRex.add(n.getText());
}
}
-
+
/**
* 初始化 分页链接
*/
@@ -127,14 +127,14 @@ public class BlogPageProcessor implements PageProcessor{
PagelinksRex.add(temString);
}
}
-
+
/**
* 初始化 获取链接列表xpath
*/
@SuppressWarnings("unchecked")
private void initLinkXpath(){
linkXpaths = new ArrayList(); //获取链接表达式
-
+
List list = spiderConfig.getSpiderNode().selectNodes("link-xpath");
for(Node node : list){
String link = node.selectSingleNode("links-xpath").getText();
@@ -145,14 +145,14 @@ public class BlogPageProcessor implements PageProcessor{
linkXpaths.add(linkXpath);
}
}
-
+
/**
* 初始化 文章规则
*/
@SuppressWarnings("unchecked")
private void initArticleXpath(){
articleXpaths = new ArrayList(); //获取文件表达式
-
+
List list = spiderConfig.getSpiderNode().selectNodes("article-xpath");
for(Node node : list){
String content = node.selectSingleNode("content-xpath").getText();
@@ -162,18 +162,18 @@ public class BlogPageProcessor implements PageProcessor{
articleXpath.contentXpath=content;
articleXpath.titleXpath=title;
articleXpath.tagsXpath = tags;
-
+
articleXpaths.add(articleXpath);
}
}
-
+
/**
* 初始化代码类型映射
*/
@SuppressWarnings("unchecked")
private void initCodeHash(){
codeHashtable = new Hashtable();
-
+
List list = spiderConfig.getSpiderNode().selectNodes("code-hashtable");
for(Node node : list){
String key = node.selectSingleNode("key").getText();
@@ -187,11 +187,11 @@ public class BlogPageProcessor implements PageProcessor{
*/
@Override
public void process(Page page) {
-
+
Pattern p=Pattern.compile(blogFlag);
Matcher m=p.matcher(url);
boolean result=m.find();
-
+
if(result){
getPage(page);
page.putField("getlinks", false);
@@ -200,7 +200,7 @@ public class BlogPageProcessor implements PageProcessor{
page.putField("getlinks", true);
}
}
-
+
/**
* 抓取链接列表
* @param page
@@ -208,35 +208,59 @@ public class BlogPageProcessor implements PageProcessor{
private void getLinks(Page page) {
List links = page.getHtml().xpath(linkXpaths.get(0).linksXpath).all();
List titles = page.getHtml().xpath(linkXpaths.get(0).titlesXpath).all();
-
+
for(int i=1; i < linkXpaths.size() && titles.size() == 0; ++i){
links = page.getHtml().xpath(linkXpaths.get(i).linksXpath).all();
titles = page.getHtml().xpath(linkXpaths.get(i).titlesXpath).all();
}
-
+
page.putField("titles", titles);
page.putField("links", links);
List Pagelinks = page.getHtml().links().regex(PagelinksRex.get(0)).all();
-
+
for(int i=1; i < PagelinksRex.size() && Pagelinks.size() == 0; ++i){
Pagelinks = page.getHtml().links().regex(PagelinksRex.get(i)).all();
}
-
+
if(this.domain.equals("www.jianshu.com")){//关于简书博客爬取的处理方式
String total = page.getHtml().xpath("//div[@class='info']/ul/li[3]/div[@class='meta-block']/a/p/text()").toString();
- int totalArticle = Integer.valueOf(total);
+ int totalArticle = Integer.parseInt(total);
int mod = totalArticle%9;
int p = totalArticle/9;
int totalPage = (mod>0)?p+1:p;
- for (int i = 1; i