diff --git a/org.tinygroup.spidersample/pom.xml b/org.tinygroup.spidersample/pom.xml index 65b356acec83868ac9af377b687e4123c622b8bb..6dd0991ad2a10cd0ec6c6d39c5732aa49871d5c7 100644 --- a/org.tinygroup.spidersample/pom.xml +++ b/org.tinygroup.spidersample/pom.xml @@ -7,12 +7,47 @@ 1.2.0-SNAPSHOT org.tinygroup.spidersample + + + + 5.1.13 + + org.tinygroup org.tinygroup.spider ${project.version} + + org.tinygroup + org.tinygroup.httpvisit + ${project.version} + + + org.tinygroup + org.tinygroup.htmlparser + ${project.version} + + + org.tinygroup + org.tinygroup.threadgroup + ${project.version} + + + + mysql + mysql-connector-java + ${mysql.driver.version} + runtime + + + + com.alibaba + druid + 1.0.12 + + - + diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/dao/Dao.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/dao/Dao.java new file mode 100644 index 0000000000000000000000000000000000000000..12a5531676120c0bcda1564096e52e045ae2159a --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/dao/Dao.java @@ -0,0 +1,79 @@ +package org.tinygroup.dao; + +import org.jsoup.Jsoup; +import org.tinygroup.entity.Category; +import org.tinygroup.entity.Joke; +import org.tinygroup.entity.Novel; +import org.tinygroup.utils.JdbcUtils; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +/** + * Created by Hulk on 2015/4/1. 
+ */ +public class Dao { + + private JdbcUtils jdbcUtils; + + public Dao() { + jdbcUtils = new JdbcUtils(); +// jdbcUtils = JdbcUtils.getInstance(); + } + + + public Category addCategory(Category category) { + jdbcUtils.getConnection(); + if (category==null){ + return null; + } + String sql = "INSERT INTO `novel_category` (`TITLE`, `PARENT_ID`) VALUES (?,?)"; + List params = new ArrayList(); + params.add(category.getTitle()); + params.add(category.getParentId()); + try { + Integer key = jdbcUtils.insertReturnKey(sql, params); + category.setId(key); + } catch (SQLException e) { + e.printStackTrace(); + }finally { + jdbcUtils.releaseConn(); + } + return category; + } + + public Novel addNovel(Novel novel) { + jdbcUtils.getConnection(); + String sql = "INSERT INTO `novel` (`NOVEL_CATEGORY_ID`, `TITLE`, `CONTENT`) VALUES (?,?,?)"; + List params = new ArrayList(); + params.add(novel.getCategoryId()); + params.add(novel.getTitle()); + params.add(novel.getContent()); + try { + jdbcUtils.insert(sql, params); + } catch (SQLException e) { + e.printStackTrace(); + }finally { + jdbcUtils.releaseConn(); + } + return novel; + } + + public Joke addJoke(Joke joke) { + jdbcUtils.getConnection(); + String sql = "INSERT INTO `novel` (`JOKE_CATEGORY_ID`, `TITLE`, `CONTENT`) VALUES (?,?,?)"; + List params = new ArrayList(); + params.add(joke.getCategoryId()); + params.add(joke.getTitle()); + params.add(joke.getContent()); + try { + jdbcUtils.insert(sql, params); + } catch (SQLException e) { + e.printStackTrace(); + }finally { + jdbcUtils.releaseConn(); + } + return joke; + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Category.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Category.java new file mode 100644 index 0000000000000000000000000000000000000000..218297eb14fce84ff0d9886a5e9d03b937ac6742 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/entity/Category.java @@ -0,0 +1,62 @@ +package 
/**
 * A content category: title, listing URL and page count, plus the database id
 * and parent id once it has been stored.
 */
public class Category {
    private Integer id;
    private Integer parentId;
    private String title;
    private String url;
    private Integer count;

    /** Creates an empty category; every property starts as {@code null}. */
    public Category() {
    }

    /**
     * @param title display title
     * @param url   listing URL (or URL fragment) of the category
     * @param count number of listing pages
     */
    public Category(String title, String url, Integer count) {
        this.count = count;
        this.url = url;
        this.title = title;
    }

    public Integer getId() {
        return this.id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the parent id, or {@code 0} when none has been set */
    public Integer getParentId() {
        if (this.parentId != null) {
            return this.parentId;
        }
        return 0;
    }

    public void setParentId(Integer parentId) {
        this.parentId = parentId;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Integer getCount() {
        return this.count;
    }

    public void setCount(Integer count) {
        this.count = count;
    }
}
/**
 * A joke: title and content, the id of its category and its own id.
 */
public class Joke {
    private Integer id;
    private Integer categoryId;
    private String title;
    private String content;

    public void setId(Integer id) {
        this.id = id;
    }

    public Integer getId() {
        return this.id;
    }

    public void setCategoryId(Integer categoryId) {
        this.categoryId = categoryId;
    }

    public Integer getCategoryId() {
        return this.categoryId;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle() {
        return this.title;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getContent() {
        return this.content;
    }
}
/**
 * A novel: title and content, the id of its category and its own id.
 */
public class Novel {
    private Integer id;
    private Integer categoryId;
    private String title;
    private String content;

    public void setId(Integer id) {
        this.id = id;
    }

    public Integer getId() {
        return this.id;
    }

    public void setCategoryId(Integer categoryId) {
        this.categoryId = categoryId;
    }

    public Integer getCategoryId() {
        return this.categoryId;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle() {
        return this.title;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getContent() {
        return this.content;
    }
}
+ */ +public class CategoryProcessor implements Processor { + private Dao dao = new Dao(); + // 线程池的容量 + private static final int POOL_SIZE = 20; + // 线程池 + ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + FastNameFilter filter = new FastNameFilter(node); + filter.setNodeName("a"); + List aList = filter.findNodeList(); +// MultiThreadProcessor processCategory = new MultiThreadProcessor("DownCategoryMultiThread"); + for (HtmlNode a : aList) { + String href = a.getAttribute("href"); + System.out.println(a.getContent() + "栏目url:" + href); + Category category = new Category(); + category.setTitle(a.getContent()); + category.setUrl(href); + dao.addCategory(category); +// processCategory.addProcessor(new DownCategoryProcessor(category)); + exe.execute(new DownCategoryProcessor(category)); + } + exe.shutdown(); +// processCategory.start(); +// processCategory.threadDone(); + } + + class DownCategoryProcessor extends AbstractProcessor implements Runnable{ + private Category category; + + public DownCategoryProcessor(Category category) { + super(category.getTitle() + "Processor"); + this.category = category; + } + + @Override + public void run() { + try { + action(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + protected void action() throws Exception { + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + Watcher watcher = new WatcherImpl(); + watcher.addProcessor(new org.tinygroup.joke.PageProcessor(category)); + QuickNameFilter nodeFilter = new QuickNameFilter(); + nodeFilter.setNodeName("p"); + nodeFilter.setIncludeAttribute("id", "pages"); + watcher.setNodeFilter(nodeFilter); + spider.addWatcher(watcher); + try { + spider.processUrl(JokeMain.host + category.getUrl()); + } catch (Exception e) { + e.printStackTrace(); + } + long end = System.currentTimeMillis(); + System.out.println(category.getTitle() + "finished 耗时(ms):" 
+ (end - start)); + + } + } + +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CcontentProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CcontentProcessor.java new file mode 100644 index 0000000000000000000000000000000000000000..8cd63eddee2021787ab6eb0bb6cbf15dec3dff63 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/CcontentProcessor.java @@ -0,0 +1,43 @@ +package org.tinygroup.joke; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; +import org.tinygroup.entity.Joke; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.tinyspider.Processor; + +import java.util.Map; + +/** + * 获取当前页面内容 + * Created by Hulk on 2015/4/1. + */ +public class CcontentProcessor implements Processor { + private Category category; + private Dao dao = new Dao(); + + public CcontentProcessor(Category category) { + this.category = category; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + FastNameFilter filter = new FastNameFilter(node); + //获取标题 + filter.setNodeName("h3"); + HtmlNode h3 = filter.findNode(); + if (h3 == null) { + return; + } + String title = h3.getSubNode("a").getContent(); + //获取正文内容 + filter.setNodeName("div").setIncludeAttribute("id", "endtext"); + String content = filter.findNode().getBody().toString(); + Joke joke = new Joke(); + joke.setTitle(title); + joke.setCategoryId(category.getId()); + joke.setContent(content); + dao.addJoke(joke); + } + +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/JokeMain.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/JokeMain.java new file mode 100644 index 0000000000000000000000000000000000000000..8958d65af61e3f269508bfdfc0f6ccb57420960e --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/JokeMain.java @@ -0,0 +1,27 @@ +package org.tinygroup.joke; + 
+import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +/** + * Created by Hulk on 2015/4/1. + */ +public class JokeMain { + static String host = "http://www.haha365.com"; + + public static void main(String[] args) throws Exception { + Spider spider = new SpiderImpl("gbk"); + Watcher watcher = new WatcherImpl(); + watcher.addProcessor(new CategoryProcessor()); + QuickNameFilter nodeFilter = new QuickNameFilter(); + nodeFilter.setNodeName("ul"); + nodeFilter.setIncludeAttribute("class", "cat_llb3"); + watcher.setNodeFilter(nodeFilter); + spider.addWatcher(watcher); + spider.processUrl(host + "/joke/"); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/PageProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/PageProcessor.java new file mode 100644 index 0000000000000000000000000000000000000000..efd1dba22b30442243d47945fdeb7489adf4d13a --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/joke/PageProcessor.java @@ -0,0 +1,102 @@ +package org.tinygroup.joke; + +import org.tinygroup.entity.Category; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.threadgroup.AbstractProcessor; +import org.tinygroup.threadgroup.MultiThreadProcessor; +import org.tinygroup.tinyspider.Processor; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * 获取当前分类页数 + * Created by Hulk on 2015/4/1. 
+ */ +public class PageProcessor implements Processor { + private Category category; + // 线程池的容量 + private static final int POOL_SIZE = 3; + // 线程池 + ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public PageProcessor(Category category) { + this.category = category; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + StringBuffer body = node.getBody(); + String pageCount = body.substring(body.indexOf("(共") + 2, body.indexOf("页)")); + Integer count = Integer.valueOf(pageCount); + System.out.println(category.getTitle() + "总页数===" + count); +// MultiThreadProcessor processPage = new MultiThreadProcessor("DownPageMultiThread"); + for (int i = 1; i <= count; i++) { + exe.execute(new DownPageProcessor(i)); +// processPage.addProcessor(new DownPageProcessor(i)); + } +// processPage.start(); +// processPage.threadDone(); + exe.shutdown(); + + } + +// public void getContent(Integer count, Category category) throws Exception { +// Spider spider = new SpiderImpl("gbk"); +// Watcher watcher = new WatcherImpl(); +// watcher.addProcessor(new CcontentProcessor(category)); +// QuickNameFilter nodeFilter = new QuickNameFilter(); +// nodeFilter.setNodeName("div"); +// nodeFilter.setIncludeAttribute("class", "r_c"); +// watcher.setNodeFilter(nodeFilter); +// spider.addWatcher(watcher); +// MultiThreadProcessor processPage = new MultiThreadProcessor("DownPageMultiThread"); +// +// for (int i = 1; i <= count; i++) { +// processPage.addProcessor(new DownPageProcessor()); +// spider.processUrl(JokeMain.host + category.getUrl() + "index_" + i + ".htm"); +// System.out.println(category.getUrl() + "当页面完成page=" + i); +// } +// } + + + class DownPageProcessor extends AbstractProcessor implements Runnable { + private int pageNo; + + public DownPageProcessor(int pageNo) { + super(category.getTitle() + " pageNo" + pageNo + " Processor"); + this.pageNo = pageNo; + } + + public void run() { + try { + action(); + } catch (Exception e) { + 
e.printStackTrace(); + } + } + + protected void action() throws Exception { +// Thread.sleep(10000); + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + Watcher watcher = new WatcherImpl(); + watcher.addProcessor(new CcontentProcessor(category)); + QuickNameFilter nodeFilter = new QuickNameFilter(); + nodeFilter.setNodeName("div"); + nodeFilter.setIncludeAttribute("class", "r_c"); + watcher.setNodeFilter(nodeFilter); + spider.addWatcher(watcher); + spider.processUrl(JokeMain.host + category.getUrl() + "index_" + pageNo + ".htm"); + long end = System.currentTimeMillis(); + System.out.println("\t\t" + category.getTitle() + "finished pageNO=" + pageNo + "耗时(ms):" + (end - start)); + } + } + + +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/BodyProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/BodyProcessor.java new file mode 100644 index 0000000000000000000000000000000000000000..7e3f45b4897fcd9ab0187bfebed270661511d718 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/BodyProcessor.java @@ -0,0 +1,35 @@ +package org.tinygroup.novel1; + +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Processor; + +import java.util.List; +import java.util.Map; + +/** + * Created by Hulk on 2015/4/2. 
+ */ +public class BodyProcessor implements Processor { + private String body; + + public String getBody() { + return body; + } + + public void setBody(String body) { + this.body = body; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + FastNameFilter filter = new FastNameFilter(node); + filter.setNodeName("p"); + HtmlNode p = filter.findNode(); + if (p == null) { + return; + } + setBody(p.toString()); +// System.out.println(body); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ContentProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ContentProcessor.java new file mode 100644 index 0000000000000000000000000000000000000000..ef43923246cf595fd4d194fbec3ba80160e310a4 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ContentProcessor.java @@ -0,0 +1,71 @@ +package org.tinygroup.novel1; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; +import org.tinygroup.entity.Novel; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +/** + * 获取当前页面内容 + * Created by Hulk on 2015/4/1. 
+ */ +public class ContentProcessor extends Thread { + private Category category; + private String url; + private Dao dao = new Dao(); + + public ContentProcessor(Category category, String url) { + this.category = category; + this.url = url; + } + + @Override + public void run() { + super.run(); + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + //获取标题 + QuickNameFilter titleNodeFilter = new QuickNameFilter(); + titleNodeFilter.setNodeName("div"); + titleNodeFilter.setIncludeAttribute("class", "title"); + TitleProcessor titleProcessor= new TitleProcessor(); + + Watcher titleWatcher = new WatcherImpl() + .processor(titleProcessor) + .nodeFilter(titleNodeFilter); + spider.addWatcher(titleWatcher); + + //获取正文 + BodyProcessor bodyProcessor = new BodyProcessor(); + QuickNameFilter bodyNodeFilter = new QuickNameFilter(); + bodyNodeFilter.setNodeName("div"); + bodyNodeFilter.setIncludeAttribute("class", "content"); + + Watcher bodyWatcher = new WatcherImpl() + .processor(bodyProcessor) + .nodeFilter(bodyNodeFilter); + spider.addWatcher(bodyWatcher); + + try { + spider.processUrl(url); + Novel novel = new Novel(); + novel.setCategoryId(category.getId()); + if (titleProcessor.getTitle()==null || bodyProcessor.getBody()==null){ + return; + } + novel.setContent(bodyProcessor.getBody()); + novel.setTitle(titleProcessor.getTitle()); + dao.addNovel(novel); + long end = System.currentTimeMillis(); + System.out.println("\t\t"+category.getTitle() + "title=" + novel.getTitle() + " get content finished 耗时(ms):" + (end - start)); + } catch (Exception e) { + e.printStackTrace(); + } + + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ItemListProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ItemListProcessor.java new file mode 100644 index 0000000000000000000000000000000000000000..0d68760c587451a14889b89a293b21edf9a28ec3 --- /dev/null +++ 
b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/ItemListProcessor.java @@ -0,0 +1,76 @@ +package org.tinygroup.novel1; + +import org.tinygroup.entity.Category; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Processor; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Created by Hulk on 2015/4/2. + */ +public class ItemListProcessor extends Thread implements Processor { + private Category category; + private int pageNo; + // 线程池的容量 + private static final int POOL_SIZE = 10; + // 线程池 + ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public ItemListProcessor(Category category, int pageNo) { + if (category==null){ + return; + } + this.category = category; + this.pageNo = pageNo; + } + + @Override + public void run() { + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + Watcher watcher = new WatcherImpl(); + watcher.addProcessor(this); + QuickNameFilter nodeFilter = new QuickNameFilter(); + nodeFilter.setNodeName("ul"); + nodeFilter.setIncludeAttribute("class", "e2"); + watcher.setNodeFilter(nodeFilter); + spider.addWatcher(watcher); + try { + spider.processUrl(NovelMain.host + category.getUrl() + pageNo + ".html"); + } catch (Exception e) { + e.printStackTrace(); + } + long end = System.currentTimeMillis(); + System.out.println(category.getTitle() + "pageNo=" + pageNo + " get items finished 耗时(ms):" + (end - start)); + } + + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + FastNameFilter filter = new FastNameFilter(node); + //获取标题 + 
filter.setNodeName("h3").setIncludeAttribute("class", "title"); + List h3List = filter.findNodeList(); + for (HtmlNode htmlNode : h3List) { + HtmlNode a = htmlNode.getSubNode("a"); + if (a == null) { + return; + } + String contentLink = a.getAttribute("href"); + if (category==null){ + return; + } + exe.execute(new ContentProcessor(category, contentLink)); + } + exe.shutdown(); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/NovelMain.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/NovelMain.java new file mode 100644 index 0000000000000000000000000000000000000000..f527d5204fa9b1d5ffced35914b571d5b99af08c --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/NovelMain.java @@ -0,0 +1,39 @@ +package org.tinygroup.novel1; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Created by Hulk on 2015/4/1. 
+ */ +public class NovelMain { + static String host = "http://www.jj59.com/xiaoshuo"; + static Dao dao = new Dao(); + // 线程池的容量 + static final int POOL_SIZE = 5; + // 线程池 + static ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public static void main(String[] args) throws Exception { + + List list = new ArrayList(); + list.add(new Category("弹指江湖", "/qingchunxiaoyuan/list_107_", 42)); + list.add(new Category("爱情小说", "/aiqingxiaoshuo/list_105_", 100)); + list.add(new Category("故事新编", "/gushixinbian/list_106_", 21)); + list.add(new Category("青春校园 ", "/qingchunxiaoyuan/list_107_", 68)); + list.add(new Category("百味人生", "/baiweirensheng/list_108_", 79)); + list.add(new Category("都市言情 ", "/dushiyanqing/list_123_", 4)); + for (Category category : list) { + dao.addCategory(category); + for (int i = 1; i <= category.getCount(); i++) { + exe.execute(new ItemListProcessor(category,i)); + } + } + exe.shutdown(); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/TitleProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/TitleProcessor.java new file mode 100644 index 0000000000000000000000000000000000000000..788592090b5e93c35e09947c30f8fa68a1bcb4e8 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel1/TitleProcessor.java @@ -0,0 +1,29 @@ +package org.tinygroup.novel1; + +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.tinyspider.Processor; + +import java.util.Map; + +/** + * Created by Hulk on 2015/4/2. 
+ */ +public class TitleProcessor implements Processor { + private String title; + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + HtmlNode h2 = node.getSubNode("h2"); + if (h2 == null) { + return; + } + setTitle(h2.getBody().toString()); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/BodyProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/BodyProcessor.java new file mode 100644 index 0000000000000000000000000000000000000000..5d859a5a65fb128c7e1590e7abeb135daefe6af9 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/BodyProcessor.java @@ -0,0 +1,34 @@ +package org.tinygroup.novel2; + +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.tinyspider.Processor; + +import java.util.Map; + +/** + * Created by Hulk on 2015/4/2. 
+ */ +public class BodyProcessor implements Processor { + private String body; + + public String getBody() { + return body; + } + + public void setBody(String body) { + this.body = body; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { +// FastNameFilter filter = new FastNameFilter(node); +// filter.setNodeName("p"); +// HtmlNode p = filter.findNode(); +// if (p == null) { +// return; +// } + String body = node.getBody().toString(); + setBody(body); +// System.out.println(body); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ContentProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ContentProcessor.java new file mode 100644 index 0000000000000000000000000000000000000000..8f59c5dc9c7d2b34569a481c46ffcbed18112df7 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ContentProcessor.java @@ -0,0 +1,71 @@ +package org.tinygroup.novel2; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; +import org.tinygroup.entity.Novel; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +/** + * 获取当前页面内容 + * Created by Hulk on 2015/4/1. 
+ */ +public class ContentProcessor extends Thread { + private Category category; + private String url; + private Dao dao = new Dao(); + + public ContentProcessor(Category category, String url) { + this.category = category; + this.url = url; + } + + @Override + public void run() { + super.run(); + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + //获取标题 + QuickNameFilter titleNodeFilter = new QuickNameFilter(); + titleNodeFilter.setNodeName("td"); + titleNodeFilter.setIncludeAttribute("class", "fox2008_cntitle"); + TitleProcessor titleProcessor= new TitleProcessor(); + + Watcher titleWatcher = new WatcherImpl() + .processor(titleProcessor) + .nodeFilter(titleNodeFilter); + spider.addWatcher(titleWatcher); + + //获取正文 + BodyProcessor bodyProcessor = new BodyProcessor(); + QuickNameFilter bodyNodeFilter = new QuickNameFilter(); + bodyNodeFilter.setNodeName("td"); + bodyNodeFilter.setIncludeAttribute("class", "fox2008_cnpicontent"); + + Watcher bodyWatcher = new WatcherImpl() + .processor(bodyProcessor) + .nodeFilter(bodyNodeFilter); + spider.addWatcher(bodyWatcher); + + try { + spider.processUrl(url); + Novel novel = new Novel(); + novel.setCategoryId(category.getId()); + if (titleProcessor.getTitle()==null || bodyProcessor.getBody()==null){ + return; + } + novel.setContent(bodyProcessor.getBody()); + novel.setTitle(titleProcessor.getTitle()); + dao.addNovel(novel); + long end = System.currentTimeMillis(); + System.out.println("\t\t"+category.getTitle() + "title=" + novel.getTitle() + " get content finished 耗时(ms):" + (end - start)); + } catch (Exception e) { + e.printStackTrace(); + } + + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ItemListProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ItemListProcessor.java new file mode 100644 index 0000000000000000000000000000000000000000..bbe4a327e6bd72f4b76081b8732f34f24f48321d --- /dev/null +++ 
b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/ItemListProcessor.java @@ -0,0 +1,69 @@ +package org.tinygroup.novel2; + +import org.tinygroup.entity.Category; +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.parser.filter.FastNameFilter; +import org.tinygroup.parser.filter.QuickNameFilter; +import org.tinygroup.tinyspider.Processor; +import org.tinygroup.tinyspider.Spider; +import org.tinygroup.tinyspider.Watcher; +import org.tinygroup.tinyspider.impl.SpiderImpl; +import org.tinygroup.tinyspider.impl.WatcherImpl; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Created by Hulk on 2015/4/2. + */ +public class ItemListProcessor extends Thread implements Processor { + private Category category; + private int pageNo; + // 线程池的容量 + private static final int POOL_SIZE = 15; + // 线程池 + ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public ItemListProcessor(Category category, int pageNo) { + if (category==null){ + return; + } + this.category = category; + this.pageNo = pageNo; + } + + @Override + public void run() { + long start = System.currentTimeMillis(); + Spider spider = new SpiderImpl("gbk"); + Watcher watcher = new WatcherImpl(); + watcher.addProcessor(this); + QuickNameFilter nodeFilter = new QuickNameFilter(); + nodeFilter.setNodeName("td"); + nodeFilter.setIncludeAttribute("class", "listdlmid"); + watcher.setNodeFilter(nodeFilter); + spider.addWatcher(watcher); + try { + spider.processUrl(NovelMain.host + category.getUrl() + pageNo + ".html"); + } catch (Exception e) { + e.printStackTrace(); + } + long end = System.currentTimeMillis(); + System.out.println(category.getTitle() + "pageNo=" + pageNo + " get items finished 耗时(ms):" + (end - start)); + } + + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + FastNameFilter filter = new FastNameFilter(node); + //获取标题 + 
filter.setNodeName("a").setIncludeAttribute("class", ""); + List aList = filter.findNodeList(); + for (HtmlNode a : aList) { + String contentLink = a.getAttribute("href"); + exe.execute(new ContentProcessor(category, NovelMain.host+contentLink)); + } + exe.shutdown(); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/NovelMain.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/NovelMain.java new file mode 100644 index 0000000000000000000000000000000000000000..40f5aebc19a698c1188da3c29fa57345c95de9d2 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/NovelMain.java @@ -0,0 +1,35 @@ +package org.tinygroup.novel2; + +import org.tinygroup.dao.Dao; +import org.tinygroup.entity.Category; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Created by Hulk on 2015/4/1. + */ +public class NovelMain { + static String host = "http://www.fox2008.cn"; + static Dao dao = new Dao(); + // 线程池的容量 + static final int POOL_SIZE = 5; + // 线程池 + static ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE); + + public static void main(String[] args) throws Exception { + + List list = new ArrayList(); + list.add(new Category("小小说精选", "/Article/ShowClass.asp?ClassID=1072&page=", 24)); + list.add(new Category("中外微型小说", "/Article/ShowClass.asp?ClassID=1297&page=", 7)); + for (Category category : list) { + dao.addCategory(category); + for (int i = 1; i <= category.getCount(); i++) { + exe.execute(new ItemListProcessor(category,i)); + } + } + exe.shutdown(); + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/TitleProcessor.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/TitleProcessor.java new file mode 100644 index 0000000000000000000000000000000000000000..924d0d8839c4445d5083b05d891ef7d1b877d835 --- /dev/null +++ 
b/org.tinygroup.spidersample/src/main/java/org/tinygroup/novel2/TitleProcessor.java @@ -0,0 +1,33 @@ +package org.tinygroup.novel2; + +import org.tinygroup.htmlparser.node.HtmlNode; +import org.tinygroup.tinyspider.Processor; + +import java.util.Map; + +/** + * Created by Hulk on 2015/4/2. + */ +public class TitleProcessor implements Processor { + private String title; + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public void process(String url, HtmlNode node, Map parameters) throws Exception { + String title = node.getPureText(); + if (title == null) { + return; + } + if (title.indexOf("(")>0){ + setTitle(title.substring(0, title.indexOf("("))); + }else { + setTitle(title); + } + } +} diff --git a/org.tinygroup.spidersample/src/main/java/org/tinygroup/utils/JdbcUtils.java b/org.tinygroup.spidersample/src/main/java/org/tinygroup/utils/JdbcUtils.java new file mode 100644 index 0000000000000000000000000000000000000000..3571ee62ad2cab8751e72eddf61fd20faf5d5c82 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/java/org/tinygroup/utils/JdbcUtils.java @@ -0,0 +1,260 @@ +package org.tinygroup.utils; + +import com.alibaba.druid.pool.DruidDataSource; + +import java.lang.reflect.Field; +import java.sql.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Created by Hulk on 2015/4/1. 
+ */ +public class JdbcUtils { + private static JdbcUtils ins; + // 表示定义数据库的用户名 + private final static String USERNAME = "root"; + // 定义数据库的密码 + private final static String PASSWORD = "hdu123"; + // 定义数据库的驱动信息 + private final static String DRIVER = "com.mysql.jdbc.Driver"; + // 定义访问数据库的地址 + private final static String URL = "jdbc:mysql://localhost:3306/tiny?characterEncoding=UTF-8"; + // 定义数据库的链接 + private Connection connection; + // 定义sql语句的执行对象 + private PreparedStatement pstmt; + // 定义查询返回的结果集合 + private ResultSet resultSet; + // 实现批处理操作的功能 + private Statement stmt; + + private static DruidDataSource dataSource; + + static { + dataSource = new DruidDataSource(); + dataSource.setDriverClassName(DRIVER); + dataSource.setUrl(URL); + dataSource.setPoolPreparedStatements(true); + dataSource.setUsername(USERNAME); + dataSource.setPassword(PASSWORD); + } + +// public JdbcUtils() { +// try { +//// Class.forName(DRIVER); +// System.out.println("注册驱动成功!!"); +// } catch (Exception e) { +// e.printStackTrace(); +// } +// } + + public static JdbcUtils getInstance() { + if (ins == null) { + ins = new JdbcUtils(); + ins.getConnection(); + } + return ins; + } + + + // 定义获得数据库的链接 + public Connection getConnection() { + try { + connection = dataSource.getConnection(); + } catch (Exception e) { + e.printStackTrace(); + } + return connection; + } + + public Integer insert(String sql, List params) throws SQLException { + pstmt = connection.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS); + int index = 1; + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + return pstmt.executeUpdate(); + } + + public Integer insertReturnKey(String sql, List params) throws SQLException { + pstmt = connection.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS); + int index = 1; + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, 
params.get(i)); + } + } + int result = pstmt.executeUpdate(); + resultSet = pstmt.getGeneratedKeys();// 返回主键 + Integer key = null; + if (result != 0 && resultSet.next()) { + key = resultSet.getInt(result); + } + return key; + } + + /** + * 完成对数据库的表的添加删除和修改的操作 + * + * @param sql + * @param params + * @return + * @throws SQLException + */ + public boolean updateByPreparedStatement(String sql, List params) + throws SQLException { + boolean flag = false; + int result = -1;// 表示当用户执行添加删除和修改的时候所影响数据库的行数 + pstmt = connection.prepareStatement(sql); + int index = 1; + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + result = pstmt.executeUpdate(); + flag = result > 0 ? true : false; + return flag; + } + + /** + * 查询返回单条记录 + * + * @param sql + * @param params + * @return + * @throws SQLException + */ + public Map findSimpleResult(String sql, List params) + throws SQLException { + Map map = new HashMap(); + int index = 1; + pstmt = connection.prepareStatement(sql); + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + resultSet = pstmt.executeQuery();// 返回查询结果 + ResultSetMetaData metaData = resultSet.getMetaData(); + int col_len = metaData.getColumnCount();// 获得列的名称 + while (resultSet.next()) { + for (int i = 0; i < col_len; i++) { + String cols_name = metaData.getColumnName(i + 1); + Object cols_value = resultSet.getObject(cols_name); + if (cols_value == null) { + cols_value = ""; + } + map.put(cols_name, cols_value); + } + } + return map; + } + + // jdbc的封装可以用反射机制来封装 + public T findSimpleRefResult(String sql, List params, + Class cls) throws Exception { + T resultObject = null; + int index = 1; + pstmt = connection.prepareStatement(sql); + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + resultSet = 
pstmt.executeQuery(); + ResultSetMetaData metaData = resultSet.getMetaData(); + int cols_len = metaData.getColumnCount(); + while (resultSet.next()) { + //通过反射机制创建实例 + resultObject = cls.newInstance(); + for (int i = 0; i < cols_len; i++) { + String cols_name = metaData.getColumnName(i + 1); + Object cols_value = resultSet.getObject(cols_name); + if (cols_value == null) { + cols_value = ""; + } + Field field = cls.getDeclaredField(cols_name); + field.setAccessible(true);// 打开javabean的访问private权限 + field.set(resultObject, cols_value); + } + } + return resultObject; + } + + /** + * 通过反射机制访问数据库 + * + * @param + * @param sql + * @param params + * @param cls + * @return + * @throws Exception + */ + public List findMoreRefResult(String sql, List params, + Class cls) throws Exception { + List list = new ArrayList(); + int index = 1; + pstmt = connection.prepareStatement(sql); + if (params != null && !params.isEmpty()) { + for (int i = 0; i < params.size(); i++) { + pstmt.setObject(index++, params.get(i)); + } + } + resultSet = pstmt.executeQuery(); + ResultSetMetaData metaData = resultSet.getMetaData(); + int cols_len = metaData.getColumnCount(); + while (resultSet.next()) { + T resultObject = cls.newInstance(); + for (int i = 0; i < cols_len; i++) { + String cols_name = metaData.getColumnName(i + 1); + Object cols_value = resultSet.getObject(cols_name); + if (cols_value == null) { + cols_value = ""; + } + Field field = cls.getDeclaredField(cols_name); + field.setAccessible(true); + field.set(resultObject, cols_value); + } + list.add(resultObject); + } + return list; + } + + public void releaseConn() { + if (resultSet != null) { + try { + resultSet.close(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + if (stmt != null) { + try { + stmt.close(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + if (pstmt != null) { + try { + pstmt.close(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + if (connection != null) { + try { + 
connection.close(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + } +} diff --git a/org.tinygroup.spidersample/src/main/resources/log4j.properties b/org.tinygroup.spidersample/src/main/resources/log4j.properties new file mode 100644 index 0000000000000000000000000000000000000000..19f562b9b93b968d017e748da73e1db6c3557fa8 --- /dev/null +++ b/org.tinygroup.spidersample/src/main/resources/log4j.properties @@ -0,0 +1,27 @@ +log4j.rootLogger=ERROR,CONSOLE,FILE + +log4j.addivity.org.apache=true + + +################### +# Console Appender +################### +log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender +log4j.appender.Threshold=ERROR +log4j.appender.CONSOLE.encoding=utf-8 +log4j.appender.CONSOLE.Target=System.out +log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout +log4j.appender.CONSOLE.layout.ConversionPattern=-%-4r [%t] %-5p %x - %m%n +#log4j.appender.CONSOLE.layout.ConversionPattern=[start]%d{DATE}[DATE]%n%p[PRIORITY]%n%x[NDC]%n%t[THREAD] n%c[CATEGORY]%n%m[MESSAGE]%n%n + + +##################### +# File Appender +##################### +log4j.appender.FILE=org.apache.log4j.FileAppender +log4j.appender.FILE.encoding=UTF-8 +log4j.appender.FILE.File=file.log +log4j.appender.FILE.Append=false +log4j.appender.FILE.layout=org.apache.log4j.PatternLayout +log4j.appender.FILE.layout.ConversionPattern=%d -%-4r [%t] %-5p %c %x - %m%n +# Use this layout for LogFactor 5 analysis \ No newline at end of file