From ff30fa4e33970420b2125966e54bd3062552fa4d Mon Sep 17 00:00:00 2001 From: Hulk Date: Fri, 3 Apr 2015 12:14:23 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E3=E4=B8=AA=E6=A1=88=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- org.tinygroup.spidersample/pom.xml | 37 ++- .../src/main/java/org/tinygroup/dao/Dao.java | 79 ++++++ .../java/org/tinygroup/entity/Category.java | 62 +++++ .../main/java/org/tinygroup/entity/Joke.java | 44 +++ .../main/java/org/tinygroup/entity/Novel.java | 44 +++ .../org/tinygroup/joke/CategoryProcessor.java | 89 ++++++ .../org/tinygroup/joke/CcontentProcessor.java | 43 +++ .../java/org/tinygroup/joke/JokeMain.java | 27 ++ .../org/tinygroup/joke/PageProcessor.java | 102 +++++++ .../org/tinygroup/novel1/BodyProcessor.java | 35 +++ .../tinygroup/novel1/ContentProcessor.java | 71 +++++ .../tinygroup/novel1/ItemListProcessor.java | 76 +++++ .../java/org/tinygroup/novel1/NovelMain.java | 39 +++ .../org/tinygroup/novel1/TitleProcessor.java | 29 ++ .../org/tinygroup/novel2/BodyProcessor.java | 34 +++ .../tinygroup/novel2/ContentProcessor.java | 71 +++++ .../tinygroup/novel2/ItemListProcessor.java | 69 +++++ .../java/org/tinygroup/novel2/NovelMain.java | 35 +++ .../org/tinygroup/novel2/TitleProcessor.java | 33 +++ .../java/org/tinygroup/utils/JdbcUtils.java | 260 ++++++++++++++++++ .../src/main/resources/log4j.properties | 27 ++ 21 files changed, 1305 insertions(+), 1 deletion(-) create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/dao/Dao.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Category.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Joke.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Novel.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CategoryProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CcontentProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/JokeMain.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/PageProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/BodyProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ContentProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ItemListProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/NovelMain.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/TitleProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/BodyProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ContentProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ItemListProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/NovelMain.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/TitleProcessor.java create mode 100644 org.tinygroup.spidersample/src/main/java/org/tinygroup/utils/JdbcUtils.java create mode 100644 org.tinygroup.spidersample/src/main/resources/log4j.properties diff --git a/org.tinygroup.spidersample/pom.xml b/org.tinygroup.spidersample/pom.xml index 65b356a..6dd0991 100644 --- a/org.tinygroup.spidersample/pom.xml +++ b/org.tinygroup.spidersample/pom.xml @@ -7,12 +7,47 @@ 1.2.0-SNAPSHOT org.tinygroup.spidersample + + + + 5.1.13 + + org.tinygroup org.tinygroup.spider ${project.version} + + org.tinygroup + org.tinygroup.httpvisit + ${project.version} + + + org.tinygroup + org.tinygroup.htmlparser + ${project.version} + + + org.tinygroup + org.tinygroup.threadgroup + ${project.version} + + + + mysql + mysql-connector-java + ${mysql.driver.version} + runtime + + + + com.alibaba + druid + 1.0.12 + + - + diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/dao/Dao.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/dao/Dao.java new file mode 100644 index 0000000..12a5531 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/dao/Dao.java @@ -0,0 +1,79 @@ +package org.tinygroup.dao; + +import org.jsoup.Jsoup; +import org.tinygroup.entity.Category; +import org.tinygroup.entity.Joke; +import org.tinygroup.entity.Novel; +import org.tinygroup.utils.JdbcUtils; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +/** + * Created by Hulk on 2015/4/1. + */ +public class Dao { + + private JdbcUtils jdbcUtils; + + public Dao() { + jdbcUtils = new JdbcUtils(); +// jdbcUtils = JdbcUtils.getInstance(); + } + + + public Category addCategory(Category category) { + jdbcUtils.getConnection(); + if (category==null){ + return null; + } + String sql = "INSERT INTO `novel_category` (`TITLE`, `PARENT_ID`) VALUES (?,?)"; + List params = new ArrayList(); + params.add(category.getTitle()); + params.add(category.getParentId()); + try { + Integer key = jdbcUtils.insertReturnKey(sql, params); + category.setId(key); + } catch (SQLException e) { + e.printStackTrace(); + }finally { + jdbcUtils.releaseConn(); + } + return category; + } + + public Novel addNovel(Novel novel) { + jdbcUtils.getConnection(); + String sql = "INSERT INTO `novel` (`NOVEL_CATEGORY_ID`, `TITLE`, `CONTENT`) VALUES (?,?,?)"; + List params = new ArrayList(); + params.add(novel.getCategoryId()); + params.add(novel.getTitle()); + params.add(novel.getContent()); + try { + jdbcUtils.insert(sql, params); + } catch (SQLException e) { + e.printStackTrace(); + }finally { + jdbcUtils.releaseConn(); + } + return novel; + } + + public Joke addJoke(Joke joke) { + jdbcUtils.getConnection(); + String sql = "INSERT INTO `novel` (`JOKE_CATEGORY_ID`, `TITLE`, `CONTENT`) VALUES (?,?,?)"; + List params = new ArrayList(); + params.add(joke.getCategoryId()); + params.add(joke.getTitle()); + params.add(joke.getContent()); + try { + jdbcUtils.insert(sql, params); + } catch (SQLException e) { + e.printStackTrace(); + }finally { + jdbcUtils.releaseConn(); + } + return joke; + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Category.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Category.java new file mode 100644 index 0000000..218297e --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Category.java @@ -0,0 +1,62 @@ +package org.tinygroup.entity; + +/** + * 分类 + * Created by Hulk on 2015/4/1. + */ +public class Category { + private Integer id; + private Integer parentId; + private String title; + private String url; + private Integer count; + + public Category() { + } + + public Category(String title, String url, Integer count) { + this.title = title; + this.url = url; + this.count = count; + } + + public Integer getId() { + return id; + } + + public void setId(Integer id) { + this.id = id; + } + + public Integer getParentId() { + return parentId == null ? 0 : parentId; + } + + public void setParentId(Integer parentId) { + this.parentId = parentId; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public Integer getCount() { + return count; + } + + public void setCount(Integer count) { + this.count = count; + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Joke.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Joke.java new file mode 100644 index 0000000..03adc40 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Joke.java @@ -0,0 +1,44 @@ +package org.tinygroup.entity; + +/** + * 笑话 + * Created by Hulk on 2015/4/1. + */ +public class Joke { + private Integer id; + private Integer categoryId; + private String title; + private String content; + + public Integer getId() { + return id; + } + + public void setId(Integer id) { + this.id = id; + } + + public Integer getCategoryId() { + return categoryId; + } + + public void setCategoryId(Integer categoryId) { + this.categoryId = categoryId; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Novel.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Novel.java new file mode 100644 index 0000000..6943b24 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Novel.java @@ -0,0 +1,44 @@ +package org.tinygroup.entity; + +/** + * 笑话 + * Created by Hulk on 2015/4/1. + */ +public class Novel { + private Integer id; + private Integer categoryId; + private String title; + private String content; + + public Integer getId() { + return id; + } + + public void setId(Integer id) { + this.id = id; + } + + public Integer getCategoryId() { + return categoryId; + } + + public void setCategoryId(Integer categoryId) { + this.categoryId = categoryId; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CategoryProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CategoryProcessor.java new file mode 100644 index 0000000..f4ff051 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CategoryProcessor.java @@ -0,0 +1,89 @@ +package org.tinygroup.joke; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.threadgroup.AbstractProcessor; +import org.tinygroup.tinyspider.Processor; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * 获取栏目分类内容 + * Created by Hulk on 2015/4/1. + */ +public class CategoryProcessor implements Processor { + private Dao dao = new Dao(); + // 线程池的容量 + private static final int POOL_SIZE = 20; + // 线程池 + ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + FastNameFilter filter = new FastNameFilter(node); + filter.setNodeName("a"); + List aList = filter.findNodeList(); +// MultiThreadProcessor processCategory = new MultiThreadProcessor("DownCategoryMultiThread"); + for (HtmlNode a : aList) { + String href = a.getAttribute("href"); + System.out.println(a.getContent() + "栏目url:" + href); + Category category = new Category(); + category.setTitle(a.getContent()); + category.setUrl(href); + dao.addCategory(category); +// processCategory.addProcessor(new DownCategoryProcessor(category)); + exe.execute(new DownCategoryProcessor(category)); + } + exe.shutdown(); +// processCategory.start(); +// processCategory.threadDone(); + } + + class DownCategoryProcessor extends AbstractProcessor implements Runnable{ + private Category category; + + public DownCategoryProcessor(Category category) { + super(category.getTitle() + "Processor"); + this.category = category; + } + + @Override + public void run() { + try { + action(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + protected void action() throws Exception { + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + Watcher watcher = new WatcherImpl(); + watcher.addProcessor(new org.tinygroup.joke.PageProcessor(category)); + QuickNameFilter nodeFilter = new QuickNameFilter(); + nodeFilter.setNodeName("p"); + nodeFilter.setIncludeAttribute("id", "pages"); + watcher.setNodeFilter(nodeFilter); + spider.addWatcher(watcher); + try { + spider.processUrl(JokeMain.host + category.getUrl()); + } catch (Exception e) { + e.printStackTrace(); + } + long end = System.currentTimeMillis(); + System.out.println(category.getTitle() + "finished 耗时(ms):" + (end - start)); + + } + } + +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CcontentProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CcontentProcessor.java new file mode 100644 index 0000000..8cd63ed --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CcontentProcessor.java @@ -0,0 +1,43 @@ +package org.tinygroup.joke; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; +import org.tinygroup.entity.Joke; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.tinyspider.Processor; + +import java.util.Map; + +/** + * 获取当前页面内容 + * Created by Hulk on 2015/4/1. + */ +public class CcontentProcessor implements Processor { + private Category category; + private Dao dao = new Dao(); + + public CcontentProcessor(Category category) { + this.category = category; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + FastNameFilter filter = new FastNameFilter(node); + //获取标题 + filter.setNodeName("h3"); + HtmlNode h3 = filter.findNode(); + if (h3 == null) { + return; + } + String title = h3.getSubNode("a").getContent(); + //获取正文内容 + filter.setNodeName("div").setIncludeAttribute("id", "endtext"); + String content = filter.findNode().getBody().toString(); + Joke joke = new Joke(); + joke.setTitle(title); + joke.setCategoryId(category.getId()); + joke.setContent(content); + dao.addJoke(joke); + } + +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/JokeMain.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/JokeMain.java new file mode 100644 index 0000000..8958d65 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/JokeMain.java @@ -0,0 +1,27 @@ +package org.tinygroup.joke; + +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +/** + * Created by Hulk on 2015/4/1. + */ +public class JokeMain { + static String host = "http://www.haha365.com"; + + public static void main(String[] args) throws Exception { + Spider spider = new SpiderImpl("gbk"); + Watcher watcher = new WatcherImpl(); + watcher.addProcessor(new CategoryProcessor()); + QuickNameFilter nodeFilter = new QuickNameFilter(); + nodeFilter.setNodeName("ul"); + nodeFilter.setIncludeAttribute("class", "cat_llb3"); + watcher.setNodeFilter(nodeFilter); + spider.addWatcher(watcher); + spider.processUrl(host + "/joke/"); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/PageProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/PageProcessor.java new file mode 100644 index 0000000..efd1dba --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/PageProcessor.java @@ -0,0 +1,102 @@ +package org.tinygroup.joke; + +import org.tinygroup.entity.Category; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.threadgroup.AbstractProcessor; +import org.tinygroup.threadgroup.MultiThreadProcessor; +import org.tinygroup.tinyspider.Processor; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * 获取当前分类页数 + * Created by Hulk on 2015/4/1. + */ +public class PageProcessor implements Processor { + private Category category; + // 线程池的容量 + private static final int POOL_SIZE = 3; + // 线程池 + ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public PageProcessor(Category category) { + this.category = category; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + StringBuffer body = node.getBody(); + String pageCount = body.substring(body.indexOf("(共") + 2, body.indexOf("页)")); + Integer count = Integer.valueOf(pageCount); + System.out.println(category.getTitle() + "总页数===" + count); +// MultiThreadProcessor processPage = new MultiThreadProcessor("DownPageMultiThread"); + for (int i = 1; i <= count; i++) { + exe.execute(new DownPageProcessor(i)); +// processPage.addProcessor(new DownPageProcessor(i)); + } +// processPage.start(); +// processPage.threadDone(); + exe.shutdown(); + + } + +// public void getContent(Integer count, Category category) throws Exception { +// Spider spider = new SpiderImpl("gbk"); +// Watcher watcher = new WatcherImpl(); +// watcher.addProcessor(new CcontentProcessor(category)); +// QuickNameFilter nodeFilter = new QuickNameFilter(); +// nodeFilter.setNodeName("div"); +// nodeFilter.setIncludeAttribute("class", "r_c"); +// watcher.setNodeFilter(nodeFilter); +// spider.addWatcher(watcher); +// MultiThreadProcessor processPage = new MultiThreadProcessor("DownPageMultiThread"); +// +// for (int i = 1; i <= count; i++) { +// processPage.addProcessor(new DownPageProcessor()); +// spider.processUrl(JokeMain.host + category.getUrl() + "index_" + i + ".htm"); +// System.out.println(category.getUrl() + "当页面完成page=" + i); +// } +// } + + + class DownPageProcessor extends AbstractProcessor implements Runnable { + private int pageNo; + + public DownPageProcessor(int pageNo) { + super(category.getTitle() + " pageNo" + pageNo + " Processor"); + this.pageNo = pageNo; + } + + public void run() { + try { + action(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + protected void action() throws Exception { +// Thread.sleep(10000); + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + Watcher watcher = new WatcherImpl(); + watcher.addProcessor(new CcontentProcessor(category)); + QuickNameFilter nodeFilter = new QuickNameFilter(); + nodeFilter.setNodeName("div"); + nodeFilter.setIncludeAttribute("class", "r_c"); + watcher.setNodeFilter(nodeFilter); + spider.addWatcher(watcher); + spider.processUrl(JokeMain.host + category.getUrl() + "index_" + pageNo + ".htm"); + long end = System.currentTimeMillis(); + System.out.println("\t\t" + category.getTitle() + "finished pageNO=" + pageNo + "耗时(ms):" + (end - start)); + } + } + + +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/BodyProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/BodyProcessor.java new file mode 100644 index 0000000..7e3f45b --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/BodyProcessor.java @@ -0,0 +1,35 @@ +package org.tinygroup.novel1; + +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Processor; + +import java.util.List; +import java.util.Map; + +/** + * Created by Hulk on 2015/4/2. + */ +public class BodyProcessor implements Processor { + private String body; + + public String getBody() { + return body; + } + + public void setBody(String body) { + this.body = body; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + FastNameFilter filter = new FastNameFilter(node); + filter.setNodeName("p"); + HtmlNode p = filter.findNode(); + if (p == null) { + return; + } + setBody(p.toString()); +// System.out.println(body); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ContentProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ContentProcessor.java new file mode 100644 index 0000000..ef43923 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ContentProcessor.java @@ -0,0 +1,71 @@ +package org.tinygroup.novel1; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; +import org.tinygroup.entity.Novel; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +/** + * 获取当前页面内容 + * Created by Hulk on 2015/4/1. + */ +public class ContentProcessor extends Thread { + private Category category; + private String url; + private Dao dao = new Dao(); + + public ContentProcessor(Category category, String url) { + this.category = category; + this.url = url; + } + + @Override + public void run() { + super.run(); + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + //获取标题 + QuickNameFilter titleNodeFilter = new QuickNameFilter(); + titleNodeFilter.setNodeName("div"); + titleNodeFilter.setIncludeAttribute("class", "title"); + TitleProcessor titleProcessor= new TitleProcessor(); + + Watcher titleWatcher = new WatcherImpl() + .processor(titleProcessor) + .nodeFilter(titleNodeFilter); + spider.addWatcher(titleWatcher); + + //获取正文 + BodyProcessor bodyProcessor = new BodyProcessor(); + QuickNameFilter bodyNodeFilter = new QuickNameFilter(); + bodyNodeFilter.setNodeName("div"); + bodyNodeFilter.setIncludeAttribute("class", "content"); + + Watcher bodyWatcher = new WatcherImpl() + .processor(bodyProcessor) + .nodeFilter(bodyNodeFilter); + spider.addWatcher(bodyWatcher); + + try { + spider.processUrl(url); + Novel novel = new Novel(); + novel.setCategoryId(category.getId()); + if (titleProcessor.getTitle()==null || bodyProcessor.getBody()==null){ + return; + } + novel.setContent(bodyProcessor.getBody()); + novel.setTitle(titleProcessor.getTitle()); + dao.addNovel(novel); + long end = System.currentTimeMillis(); + System.out.println("\t\t"+category.getTitle() + "title=" + novel.getTitle() + " get content finished 耗时(ms):" + (end - start)); + } catch (Exception e) { + e.printStackTrace(); + } + + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ItemListProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ItemListProcessor.java new file mode 100644 index 0000000..0d68760 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ItemListProcessor.java @@ -0,0 +1,76 @@ +package org.tinygroup.novel1; + +import org.tinygroup.entity.Category; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Processor; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Created by Hulk on 2015/4/2. + */ +public class ItemListProcessor extends Thread implements Processor { + private Category category; + private int pageNo; + // 线程池的容量 + private static final int POOL_SIZE = 10; + // 线程池 + ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public ItemListProcessor(Category category, int pageNo) { + if (category==null){ + return; + } + this.category = category; + this.pageNo = pageNo; + } + + @Override + public void run() { + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + Watcher watcher = new WatcherImpl(); + watcher.addProcessor(this); + QuickNameFilter nodeFilter = new QuickNameFilter(); + nodeFilter.setNodeName("ul"); + nodeFilter.setIncludeAttribute("class", "e2"); + watcher.setNodeFilter(nodeFilter); + spider.addWatcher(watcher); + try { + spider.processUrl(NovelMain.host + category.getUrl() + pageNo + ".html"); + } catch (Exception e) { + e.printStackTrace(); + } + long end = System.currentTimeMillis(); + System.out.println(category.getTitle() + "pageNo=" + pageNo + " get items finished 耗时(ms):" + (end - start)); + } + + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + FastNameFilter filter = new FastNameFilter(node); + //获取标题 + filter.setNodeName("h3").setIncludeAttribute("class", "title"); + List h3List = filter.findNodeList(); + for (HtmlNode htmlNode : h3List) { + HtmlNode a = htmlNode.getSubNode("a"); + if (a == null) { + return; + } + String contentLink = a.getAttribute("href"); + if (category==null){ + return; + } + exe.execute(new ContentProcessor(category, contentLink)); + } + exe.shutdown(); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/NovelMain.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/NovelMain.java new file mode 100644 index 0000000..f527d52 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/NovelMain.java @@ -0,0 +1,39 @@ +package org.tinygroup.novel1; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Created by Hulk on 2015/4/1. + */ +public class NovelMain { + static String host = "http://www.jj59.com/xiaoshuo"; + static Dao dao = new Dao(); + // 线程池的容量 + static final int POOL_SIZE = 5; + // 线程池 + static ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public static void main(String[] args) throws Exception { + + List list = new ArrayList(); + list.add(new Category("弹指江湖", "/qingchunxiaoyuan/list_107_", 42)); + list.add(new Category("爱情小说", "/aiqingxiaoshuo/list_105_", 100)); + list.add(new Category("故事新编", "/gushixinbian/list_106_", 21)); + list.add(new Category("青春校园 ", "/qingchunxiaoyuan/list_107_", 68)); + list.add(new Category("百味人生", "/baiweirensheng/list_108_", 79)); + list.add(new Category("都市言情 ", "/dushiyanqing/list_123_", 4)); + for (Category category : list) { + dao.addCategory(category); + for (int i = 1; i <= category.getCount(); i++) { + exe.execute(new ItemListProcessor(category,i)); + } + } + exe.shutdown(); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/TitleProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/TitleProcessor.java new file mode 100644 index 0000000..7885920 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/TitleProcessor.java @@ -0,0 +1,29 @@ +package org.tinygroup.novel1; + +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.tinyspider.Processor; + +import java.util.Map; + +/** + * Created by Hulk on 2015/4/2. + */ +public class TitleProcessor implements Processor { + private String title; + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + HtmlNode h2 = node.getSubNode("h2"); + if (h2 == null) { + return; + } + setTitle(h2.getBody().toString()); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/BodyProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/BodyProcessor.java new file mode 100644 index 0000000..5d859a5 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/BodyProcessor.java @@ -0,0 +1,34 @@ +package org.tinygroup.novel2; + +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.tinyspider.Processor; + +import java.util.Map; + +/** + * Created by Hulk on 2015/4/2. + */ +public class BodyProcessor implements Processor { + private String body; + + public String getBody() { + return body; + } + + public void setBody(String body) { + this.body = body; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { +// FastNameFilter filter = new FastNameFilter(node); +// filter.setNodeName("p"); +// HtmlNode p = filter.findNode(); +// if (p == null) { +// return; +// } + String body = node.getBody().toString(); + setBody(body); +// System.out.println(body); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ContentProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ContentProcessor.java new file mode 100644 index 0000000..8f59c5d --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ContentProcessor.java @@ -0,0 +1,71 @@ +package org.tinygroup.novel2; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; +import org.tinygroup.entity.Novel; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +/** + * 获取当前页面内容 + * Created by Hulk on 2015/4/1. + */ +public class ContentProcessor extends Thread { + private Category category; + private String url; + private Dao dao = new Dao(); + + public ContentProcessor(Category category, String url) { + this.category = category; + this.url = url; + } + + @Override + public void run() { + super.run(); + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + //获取标题 + QuickNameFilter titleNodeFilter = new QuickNameFilter(); + titleNodeFilter.setNodeName("td"); + titleNodeFilter.setIncludeAttribute("class", "fox2008_cntitle"); + TitleProcessor titleProcessor= new TitleProcessor(); + + Watcher titleWatcher = new WatcherImpl() + .processor(titleProcessor) + .nodeFilter(titleNodeFilter); + spider.addWatcher(titleWatcher); + + //获取正文 + BodyProcessor bodyProcessor = new BodyProcessor(); + QuickNameFilter bodyNodeFilter = new QuickNameFilter(); + bodyNodeFilter.setNodeName("td"); + bodyNodeFilter.setIncludeAttribute("class", "fox2008_cnpicontent"); + + Watcher bodyWatcher = new WatcherImpl() + .processor(bodyProcessor) + .nodeFilter(bodyNodeFilter); + spider.addWatcher(bodyWatcher); + + try { + spider.processUrl(url); + Novel novel = new Novel(); + novel.setCategoryId(category.getId()); + if (titleProcessor.getTitle()==null || bodyProcessor.getBody()==null){ + return; + } + novel.setContent(bodyProcessor.getBody()); + novel.setTitle(titleProcessor.getTitle()); + dao.addNovel(novel); + long end = System.currentTimeMillis(); + System.out.println("\t\t"+category.getTitle() + "title=" + novel.getTitle() + " get content finished 耗时(ms):" + (end - start)); + } catch (Exception e) { + e.printStackTrace(); + } + + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ItemListProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ItemListProcessor.java new file mode 100644 index 0000000..bbe4a32 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ItemListProcessor.java @@ -0,0 +1,69 @@ +package org.tinygroup.novel2; + +import org.tinygroup.entity.Category; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Processor; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Created by Hulk on 2015/4/2. + */ +public class ItemListProcessor extends Thread implements Processor { + private Category category; + private int pageNo; + // 线程池的容量 + private static final int POOL_SIZE = 15; + // 线程池 + ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public ItemListProcessor(Category category, int pageNo) { + if (category==null){ + return; + } + this.category = category; + this.pageNo = pageNo; + } + + @Override + public void run() { + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + Watcher watcher = new WatcherImpl(); + watcher.addProcessor(this); + QuickNameFilter nodeFilter = new QuickNameFilter(); + nodeFilter.setNodeName("td"); + nodeFilter.setIncludeAttribute("class", "listdlmid"); + watcher.setNodeFilter(nodeFilter); + spider.addWatcher(watcher); + try { + spider.processUrl(NovelMain.host + category.getUrl() + pageNo + ".html"); + } catch (Exception e) { + e.printStackTrace(); + } + long end = System.currentTimeMillis(); + System.out.println(category.getTitle() + "pageNo=" + pageNo + " get items finished 耗时(ms):" + (end - start)); + } + + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + FastNameFilter filter = new FastNameFilter(node); + //获取标题 + filter.setNodeName("a").setIncludeAttribute("class", ""); + List aList = filter.findNodeList(); + for (HtmlNode a : aList) { + String contentLink = a.getAttribute("href"); + exe.execute(new ContentProcessor(category, NovelMain.host+contentLink)); + } + exe.shutdown(); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/NovelMain.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/NovelMain.java new file mode 100644 index 0000000..40f5aeb --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/NovelMain.java @@ -0,0 +1,35 @@ +package org.tinygroup.novel2; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Created by Hulk on 2015/4/1. + */ +public class NovelMain { + static String host = "http://www.fox2008.cn"; + static Dao dao = new Dao(); + // 线程池的容量 + static final int POOL_SIZE = 5; + // 线程池 + static ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public static void main(String[] args) throws Exception { + + List list = new ArrayList(); + list.add(new Category("小小说精选", "/Article/ShowClass.asp?ClassID=1072&page=", 24)); + list.add(new Category("中外微型小说", "/Article/ShowClass.asp?ClassID=1297&page=", 7)); + for (Category category : list) { + dao.addCategory(category); + for (int i = 1; i <= category.getCount(); i++) { + exe.execute(new ItemListProcessor(category,i)); + } + } + exe.shutdown(); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/TitleProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/TitleProcessor.java new file mode 100644 index 0000000..924d0d8 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/TitleProcessor.java @@ -0,0 +1,33 @@ +package org.tinygroup.novel2; + +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.tinyspider.Processor; + +import java.util.Map; + +/** + * Created by Hulk on 2015/4/2. + */ +public class TitleProcessor implements Processor { + private String title; + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + String title = node.getPureText(); + if (title == null) { + return; + } + if (title.indexOf("(")>0){ + setTitle(title.substring(0, title.indexOf("("))); + }else { + setTitle(title); + } + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/utils/JdbcUtils.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/utils/JdbcUtils.java new file mode 100644 index 0000000..3571ee6 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/utils/JdbcUtils.java @@ -0,0 +1,260 @@ +package org.tinygroup.utils; + +import com.alibaba.druid.pool.DruidDataSource; + +import java.lang.reflect.Field; +import java.sql.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Created by Hulk on 2015/4/1. + */ +public class JdbcUtils { + private static JdbcUtils ins; + // 表示定义数据库的用户名 + private final static String USERNAME = "root"; + // 定义数据库的密码 + private final static String PASSWORD = "hdu123"; + // 定义数据库的驱动信息 + private final static String DRIVER = "com.mysql.jdbc.Driver"; + // 定义访问数据库的地址 + private final static String URL = "jdbc:mysql://localhost:3306/tiny?characterEncoding=UTF-8"; + // 定义数据库的链接 + private Connection connection; + // 定义sql语句的执行对象 + private PreparedStatement pstmt; + // 定义查询返回的结果集合 + private ResultSet resultSet; + // 实现批处理操作的功能 + private Statement stmt; + + private static DruidDataSource dataSource; + + static { + dataSource = new DruidDataSource(); + dataSource.setDriverClassName(DRIVER); + dataSource.setUrl(URL); + dataSource.setPoolPreparedStatements(true); + dataSource.setUsername(USERNAME); + dataSource.setPassword(PASSWORD); + } + +// public JdbcUtils() { +// try { +//// Class.forName(DRIVER); +// System.out.println("注册驱动成功!!"); +// } catch (Exception e) { +// e.printStackTrace(); +// } +// } + + public static JdbcUtils getInstance() { + if (ins == null) { + ins = new JdbcUtils(); + ins.getConnection(); + } + return ins; + } + + + // 定义获得数据库的链接 + public Connection getConnection() { + try { + connection = dataSource.getConnection(); + } catch (Exception e) { + e.printStackTrace(); + } + return connection; + } + + public Integer insert(String sql, List params) throws SQLException { + pstmt = connection.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS); + int index = 1; + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + return pstmt.executeUpdate(); + } + + public Integer insertReturnKey(String sql, List params) throws SQLException { + pstmt = connection.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS); + int index = 1; + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + int result = pstmt.executeUpdate(); + resultSet = pstmt.getGeneratedKeys();// 返回主键 + Integer key = null; + if (result != 0 && resultSet.next()) { + key = resultSet.getInt(result); + } + return key; + } + + /** + * 完成对数据库的表的添加删除和修改的操作 + * + * @param sql + * @param params + * @return + * @throws SQLException + */ + public boolean updateByPreparedStatement(String sql, List params) + throws SQLException { + boolean flag = false; + int result = -1;// 表示当用户执行添加删除和修改的时候所影响数据库的行数 + pstmt = connection.prepareStatement(sql); + int index = 1; + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + result = pstmt.executeUpdate(); + flag = result > 0 ? true : false; + return flag; + } + + /** + * 查询返回单条记录 + * + * @param sql + * @param params + * @return + * @throws SQLException + */ + public Map findSimpleResult(String sql, List params) + throws SQLException { + Map map = new HashMap(); + int index = 1; + pstmt = connection.prepareStatement(sql); + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + resultSet = pstmt.executeQuery();// 返回查询结果 + ResultSetMetaData metaData = resultSet.getMetaData(); + int col_len = metaData.getColumnCount();// 获得列的名称 + while (resultSet.next()) { + for (int i = 0; i < col_len; i++) { + String cols_name = metaData.getColumnName(i + 1); + Object cols_value = resultSet.getObject(cols_name); + if (cols_value == null) { + cols_value = ""; + } + map.put(cols_name, cols_value); + } + } + return map; + } + + // jdbc的封装可以用反射机制来封装 + public T findSimpleRefResult(String sql, List params, + Class cls) throws Exception { + T resultObject = null; + int index = 1; + pstmt = connection.prepareStatement(sql); + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + resultSet = pstmt.executeQuery(); + ResultSetMetaData metaData = resultSet.getMetaData(); + int cols_len = metaData.getColumnCount(); + while (resultSet.next()) { + //通过反射机制创建实例 + resultObject = cls.newInstance(); + for (int i = 0; i < cols_len; i++) { + String cols_name = metaData.getColumnName(i + 1); + Object cols_value = resultSet.getObject(cols_name); + if (cols_value == null) { + cols_value = ""; + } + Field field = cls.getDeclaredField(cols_name); + field.setAccessible(true);// 打开javabean的访问private权限 + field.set(resultObject, cols_value); + } + } + return resultObject; + } + + /** + * 通过反射机制访问数据库 + * + * @param + * @param sql + * @param params + * @param cls + * @return + * @throws Exception + */ + public List findMoreRefResult(String sql, List params, + Class cls) throws Exception { + List list = new ArrayList(); + int index = 1; + pstmt = connection.prepareStatement(sql); + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + resultSet = pstmt.executeQuery(); + ResultSetMetaData metaData = resultSet.getMetaData(); + int cols_len = metaData.getColumnCount(); + while (resultSet.next()) { + T resultObject = cls.newInstance(); + for (int i = 0; i < cols_len; i++) { + String cols_name = metaData.getColumnName(i + 1); + Object cols_value = resultSet.getObject(cols_name); + if (cols_value == null) { + cols_value = ""; + } + Field field = cls.getDeclaredField(cols_name); + field.setAccessible(true); + field.set(resultObject, cols_value); + } + list.add(resultObject); + } + return list; + } + + public void releaseConn() { + if (resultSet != null) { + try { + resultSet.close(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + if (stmt != null) { + try { + stmt.close(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + if (pstmt != null) { + try { + pstmt.close(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + if (connection != null) { + try { + connection.close(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + } +} diff --git a/org.tinygroup.spidersample/src/main/resources/log4j.properties b/org.tinygroup.spidersample/src/main/resources/log4j.properties new file mode 100644 index 0000000..19f562b --- /dev/null +++ b/org.tinygroup.spidersample/src/main/resources/log4j.properties @@ -0,0 +1,27 @@ +log4j.rootLogger=ERROR,CONSOLE,FILE + +log4j.addivity.org.apache=true + + +################### +# Console Appender +################### +log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender +log4j.appender.Threshold=ERROR +log4j.appender.CONSOLE.encoding=utf-8 +log4j.appender.CONSOLE.Target=System.out +log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout +log4j.appender.CONSOLE.layout.ConversionPattern=-%-4r [%t] %-5p %x - %m%n +#log4j.appender.CONSOLE.layout.ConversionPattern=[start]%d{DATE}[DATE]%n%p[PRIORITY]%n%x[NDC]%n%t[THREAD] n%c[CATEGORY]%n%m[MESSAGE]%n%n + + +##################### +# File Appender +##################### +log4j.appender.FILE=org.apache.log4j.FileAppender +log4j.appender.FILE.encoding=UTF-8 +log4j.appender.FILE.File=file.log +log4j.appender.FILE.Append=false +log4j.appender.FILE.layout=org.apache.log4j.PatternLayout +log4j.appender.FILE.layout.ConversionPattern=%d -%-4r [%t] %-5p %c %x - %m%n +# Use this layout for LogFactor 5 analysis \ No newline at end of file -- Gitee