From 8009fdf869f30027ffdf99b08e7d70db3d9a2a22 Mon Sep 17 00:00:00 2001 From: lebin Date: Fri, 8 Jul 2022 14:21:32 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BB=8E=20github=20=E5=90=8C=E6=AD=A50.7.5?= =?UTF-8?q?=E7=89=88=E6=9C=AC=E4=BB=A3=E7=A0=81=E5=88=B0=20gitee?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-zh.md | 11 +- README.md | 6 +- pom.xml | 277 ++++++++++++++---- webmagic-core/pom.xml | 12 +- .../java/us/codecraft/webmagic/Request.java | 21 +- .../us/codecraft/webmagic/ResultItems.java | 2 +- .../main/java/us/codecraft/webmagic/Site.java | 13 +- .../java/us/codecraft/webmagic/Spider.java | 26 +- .../us/codecraft/webmagic/SpiderListener.java | 9 + .../downloader/HttpClientDownloader.java | 14 +- .../downloader/HttpClientGenerator.java | 100 ++++--- .../downloader/HttpUriRequestConverter.java | 2 +- .../us/codecraft/webmagic/proxy/Proxy.java | 188 ++++++++---- .../webmagic/proxy/ProxyProvider.java | 2 +- .../webmagic/proxy/SimpleProxyProvider.java | 1 + .../webmagic/selector/LinksSelector.java | 12 +- .../webmagic/selector/RegexSelector.java | 2 +- .../codecraft/webmagic/proxy/ProxyTest.java | 114 +++++-- .../webmagic/utils/NumberUtilsTest.java | 16 + webmagic-coverage/pom.xml | 72 +++++ webmagic-extension/pom.xml | 8 +- .../example/PatternProcessorExample.java | 6 +- .../webmagic/monitor/SpiderMonitor.java | 4 + .../webmagic/monitor/SpiderStatus.java | 9 +- .../scheduler/FileCacheQueueScheduler.java | 35 ++- .../scheduler/RedisPriorityScheduler.java | 95 +++--- .../webmagic/scheduler/RedisScheduler.java | 33 +-- webmagic-samples/pom.xml | 26 +- .../recover/DuplicateStorageRemover.java | 78 +++++ .../webmagic/recover/MmapQueueScheduler.java | 85 ++++++ .../webmagic/recover/RecoverSample.java | 22 ++ webmagic-saxon/pom.xml | 8 +- .../webmagic/selector/Xpath2Selector.java | 37 +-- .../webmagic/selector/XpathSelectorTest.java | 13 +- webmagic-scripts/pom.xml | 38 +-- webmagic-selenium/pom.xml | 11 +- .../selenium/SeleniumDownloader.java | 6 +- .../downloader/selenium/WebDriverPool.java | 5 +- 38 files changed, 1020 insertions(+), 399 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java create mode 100644 webmagic-coverage/pom.xml create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java mode change 100755 => 100644 webmagic-scripts/pom.xml diff --git a/README-zh.md b/README-zh.md index cd1b090..62b3c9a 100644 --- a/README-zh.md +++ b/README-zh.md @@ -1,9 +1,10 @@ ![logo](http://webmagic.io/images/logo.jpeg) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/) +[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) - 官方网站[http://webmagic.io/](http://webmagic.io/) >webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 @@ -38,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.3 + 0.7.5 us.codecraft webmagic-extension - 0.7.3 + 0.7.5 ``` @@ -93,7 +94,7 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: ```java -public class OschinaBlogPageProcesser implements PageProcessor { +public class OschinaBlogPageProcessor implements PageProcessor { private Site site = Site.me().setDomain("my.oschina.net"); @@ -113,7 +114,7 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") + Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog") .addPipeline(new ConsolePipeline()).run(); } } diff --git a/README.md b/README.md index 73cb488..14aeac7 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ [Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/) +[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) >A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler. @@ -23,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.3 + 0.7.5 us.codecraft webmagic-extension - 0.7.3 + 0.7.5 ``` diff --git a/pom.xml b/pom.xml index 2b2384f..dea6dea 100644 --- a/pom.xml +++ b/pom.xml @@ -1,19 +1,15 @@ - - org.sonatype.oss - oss-parent - 7 - us.codecraft - 0.7.3 + 0.7.5 4.0.0 pom UTF-8 UTF-8 + 1.8 + 1.8 4.0.0.RELEASE - webmagic-parent webmagic-parent @@ -38,8 +34,8 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - webmagic-parent-0.6.1 - + WebMagic-${project.version} + Apache License, Version 2.0 @@ -54,6 +50,7 @@ webmagic-selenium webmagic-saxon webmagic-samples + webmagic-coverage @@ -61,7 +58,7 @@ junit junit - 4.11 + 4.13.1 test @@ -73,47 +70,47 @@ org.apache.httpcomponents httpclient - 4.5.2 + 4.5.13 org.apache.httpcomponents httpcore - 4.4.4 + 4.4.14 com.google.guava guava - 15.0 + 30.1-jre com.jayway.jsonpath json-path - 2.4.0 + 2.6.0 org.slf4j slf4j-api - 1.7.6 + 1.7.30 org.slf4j slf4j-log4j12 - 1.7.6 + 1.7.30 us.codecraft xsoup - 0.3.1 + 0.3.4 com.alibaba fastjson - 1.2.28 + 1.2.83 com.github.dreamhead moco-core - 0.11.0 + 1.3.0 test @@ -130,13 +127,13 @@ org.assertj assertj-core - 1.5.0 + 3.18.1 test org.apache.commons commons-lang3 - 3.1 + 3.11 commons-collections @@ -144,43 +141,87 @@ 3.2.2 - org.apache.commons + commons-io commons-io - 1.3.2 + 2.8.0 - org.jsoup - jsoup - 1.10.3 + org.codehaus.groovy + groovy-all + 3.0.7 - org.mockito - mockito-all - 1.9.5 - test + org.jruby + jruby + 9.3.0.0 + + + org.python + jython + 2.7.2 + + + org.seleniumhq.selenium + selenium-java + 3.141.59 + + + net.sf.saxon + Saxon-HE + 10.3 + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.26 + + + com.github.detro + phantomjsdriver + 1.2.0 + + + commons-cli + commons-cli + 1.4 + + + redis.clients + jedis + 3.6.0 + + org.apache.maven.plugins + maven-enforcer-plugin + 3.0.0-M3 + + + enforce-maven + + enforce + + + + + 3.3.9 + + + + + + org.apache.maven.plugins maven-surefire-plugin - 2.18 - - 0 - org.apache.maven.plugins maven-compiler-plugin - 3.1 - - 1.6 - 1.6 - UTF-8 - @@ -205,10 +246,6 @@ org.apache.maven.plugins maven-resources-plugin - 2.6 - - UTF-8 - org.apache.maven.plugins @@ -222,7 +259,7 @@ org.apache.maven.plugins maven-source-plugin - 2.2.1 + 3.2.1 attach-sources @@ -235,11 +272,15 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.10.4 + 3.2.0 UTF-8 - WebMagic 0.7.3 + WebMagic ${project.version} en_US + + + false + @@ -260,11 +301,149 @@ org.apache.maven.plugins maven-release-plugin - 2.4.1 + 3.0.0-M1 + + + org.jacoco + jacoco-maven-plugin + + + + prepare-agent + + + + report + verify + + report + + + + + + com.amashchenko.maven.plugin + gitflow-maven-plugin + + + WebMagic- + + + + + + org.apache.maven.plugins + maven-clean-plugin + 3.1.0 + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + org.apache.maven.plugins + maven-deploy-plugin + 3.0.0-M1 + + + org.apache.maven.plugins + maven-install-plugin + 3.0.0-M1 + + + org.apache.maven.plugins + maven-jar-plugin + 3.2.0 + + + org.apache.maven.plugins + maven-jxr-plugin + 3.1.1 + + + org.apache.maven.plugins + maven-pmd-plugin + 3.14.0 + + + org.apache.maven.plugins + maven-resources-plugin + 3.2.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.9.1 + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M5 + + + org.apache.maven.plugins + maven-surefire-report-plugin + 3.0.0-M5 + + + org.codehaus.mojo + taglist-maven-plugin + 2.4 + + + org.jacoco + jacoco-maven-plugin + 0.8.7 + + + com.amashchenko.maven.plugin + gitflow-maven-plugin + 1.15.0 + + + com.github.spotbugs + spotbugs-maven-plugin + 4.2.3 + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + none + + + + org.apache.maven.plugins + maven-jxr-plugin + + + org.apache.maven.plugins + maven-pmd-plugin + + + org.apache.maven.plugins + maven-surefire-report-plugin + + + org.codehaus.mojo + taglist-maven-plugin + + + com.github.spotbugs + spotbugs-maven-plugin + + + + release @@ -315,7 +494,7 @@ org.sonatype.plugins nexus-staging-maven-plugin - 1.6 + 1.6.8 true sonatype-nexus-staging diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index e889cd4..ec718a1 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.3 + 0.7.5 4.0.0 @@ -48,6 +48,7 @@ org.slf4j slf4j-log4j12 + true @@ -61,12 +62,7 @@ - org.jsoup - jsoup - - - - org.apache.commons + commons-io commons-io @@ -82,4 +78,4 @@ - \ No newline at end of file + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index eefd91b..9fc2861 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.model.HttpRequestBody; import us.codecraft.webmagic.utils.Experimental; @@ -26,6 +27,11 @@ public class Request implements Serializable { private HttpRequestBody requestBody; + /** + * this req use this downloader + */ + private Downloader downloader; + /** * Store additional information in extras. */ @@ -78,14 +84,15 @@ public class Request implements Serializable { return this; } - public Object getExtra(String key) { + @SuppressWarnings("unchecked") + public T getExtra(String key) { if (extras == null) { return null; } - return extras.get(key); + return (T) extras.get(key); } - public Request putExtra(String key, Object value) { + public Request putExtra(String key, T value) { if (extras == null) { extras = new HashMap(); } @@ -174,6 +181,14 @@ public class Request implements Serializable { return binaryContent; } + public Downloader getDownloader() { + return downloader; + } + + public void setDownloader(Downloader downloader) { + this.downloader = downloader; + } + public Request setBinaryContent(boolean binaryContent) { this.binaryContent = binaryContent; return this; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 7b54361..488c81e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; @@ -21,6 +20,7 @@ public class ResultItems { private boolean skip; + @SuppressWarnings("unchecked") public T get(String key) { Object o = fields.get(key); if (o == null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index b6963ca..4879b28 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,8 +1,13 @@ package us.codecraft.webmagic; -import us.codecraft.webmagic.utils.HttpConstant; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.UUID; -import java.util.*; +import us.codecraft.webmagic.utils.HttpConstant; /** * Object contains setting for crawler.
@@ -203,7 +208,7 @@ public class Site { /** * Set the interval between the processing of two pages.
- * Time unit is micro seconds.
+ * Time unit is milliseconds.
* * @param sleepTime sleepTime * @return this @@ -215,7 +220,7 @@ public class Site { /** * Get the interval between the processing of two pages.
- * Time unit is micro seconds.
+ * Time unit is milliseconds.
* * @return the interval between the processing of two pages, */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 62c989f..5940e73 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -208,7 +208,8 @@ public class Spider implements Runnable, Task { * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline) * @deprecated */ - public Spider pipeline(Pipeline pipeline) { + @Deprecated + public Spider pipeline(Pipeline pipeline) { return addPipeline(pipeline); } @@ -258,7 +259,8 @@ public class Spider implements Runnable, Task { * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) * @deprecated */ - public Spider downloader(Downloader downloader) { + @Deprecated + public Spider downloader(Downloader downloader) { return setDownloader(downloader); } @@ -320,7 +322,7 @@ public class Spider implements Runnable, Task { processRequest(request); onSuccess(request); } catch (Exception e) { - onError(request); + onError(request, e); logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); @@ -338,10 +340,19 @@ public class Spider implements Runnable, Task { logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get()); } + /** + * @deprecated Use {@link #onError(Request, Exception)} instead. + */ + @Deprecated protected void onError(Request request) { + } + + protected void onError(Request request, Exception e) { + this.onError(request); + if (CollectionUtils.isNotEmpty(spiderListeners)) { for (SpiderListener spiderListener : spiderListeners) { - spiderListener.onError(request); + spiderListener.onError(request, e); } } } @@ -401,7 +412,12 @@ public class Spider implements Runnable, Task { } private void processRequest(Request request) { - Page page = downloader.download(request, this); + Page page; + if (null != request.getDownloader()){ + page = request.getDownloader().download(request,this); + }else { + page = downloader.download(request, this); + } if (page.isDownloadSuccess()){ onDownloadSuccess(request, page); } else { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java index 0678180..8f10e0e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java @@ -10,5 +10,14 @@ public interface SpiderListener { public void onSuccess(Request request); + /** + * @deprecated Use {@link #onError(Request, Exception)} instead. + */ + @Deprecated public void onError(Request request); + + default void onError(Request request, Exception e) { + this.onError(request); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 24889c8..49217e1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,5 +1,10 @@ package us.codecraft.webmagic.downloader; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.HashMap; +import java.util.Map; + import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; import org.apache.http.client.methods.CloseableHttpResponse; @@ -7,6 +12,7 @@ import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -17,12 +23,6 @@ import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpClientUtils; -import java.io.IOException; -import java.nio.charset.Charset; -import java.util.HashMap; -import java.util.Map; - - /** * The http downloader based on HttpClient. * @@ -38,7 +38,7 @@ public class HttpClientDownloader extends AbstractDownloader { private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); - + private ProxyProvider proxyProvider; private boolean responseHeader = true; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 28a16f4..80e0f10 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,5 +1,18 @@ package us.codecraft.webmagic.downloader; +import java.io.IOException; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.util.Map; + +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; + +import org.apache.commons.lang3.JavaVersion; +import org.apache.commons.lang3.SystemUtils; import org.apache.http.HttpException; import org.apache.http.HttpRequest; import org.apache.http.HttpRequestInterceptor; @@ -11,32 +24,27 @@ import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.impl.client.*; +import org.apache.http.impl.client.BasicCookieStore; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.Site; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; -import java.io.IOException; -import java.security.KeyManagementException; -import java.security.NoSuchAlgorithmException; -import java.security.cert.CertificateException; -import java.security.cert.X509Certificate; -import java.util.Map; +import us.codecraft.webmagic.Site; /** * @author code4crafter@gmail.com
* @since 0.4.0 */ public class HttpClientGenerator { - + private transient Logger logger = LoggerFactory.getLogger(getClass()); - + private PoolingHttpClientConnectionManager connectionManager; public HttpClientGenerator() { @@ -48,43 +56,51 @@ public class HttpClientGenerator { connectionManager.setDefaultMaxPerRoute(100); } - private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { - try { - return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, + private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { + try { + SSLContext sslContext = createIgnoreVerifySSL(); + String[] supportedProtocols; + if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) { + supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" }; + } else { + supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" }; + } + logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols)); + return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { + } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); } - return SSLConnectionSocketFactory.getSocketFactory(); + return SSLConnectionSocketFactory.getSocketFactory(); } - private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { - // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 - X509TrustManager trustManager = new X509TrustManager() { - - @Override - public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { - } - - @Override - public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { - } - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - - }; - - SSLContext sc = SSLContext.getInstance("SSLv3"); - sc.init(null, new TrustManager[] { trustManager }, null); - return sc; + private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { + // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 + X509TrustManager trustManager = new X509TrustManager() { + + @Override + public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { + } + + @Override + public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { + } + + @Override + public X509Certificate[] getAcceptedIssuers() { + return null; + } + + }; + + SSLContext sc = SSLContext.getInstance("TLS"); + sc.init(null, new TrustManager[] { trustManager }, null); + return sc; } - + public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); return this; @@ -96,7 +112,7 @@ public class HttpClientGenerator { private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom(); - + httpClientBuilder.setConnectionManager(connectionManager); if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 28a7ce5..4baaf4a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -74,7 +74,7 @@ public class HttpUriRequestConverter { } if (proxy != null) { - requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort())); + requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme())); } requestBuilder.setConfig(requestConfigBuilder.build()); HttpUriRequest httpUriRequest = requestBuilder.build(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index c5f1007..6554fab 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -1,73 +1,135 @@ package us.codecraft.webmagic.proxy; -/** - * - */ +import java.io.UnsupportedEncodingException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.lang3.StringUtils; public class Proxy { - private String host; - private int port; - private String username; - private String password; + private String scheme; + + private String host; + + private int port; + + private String username; + + private String password; - public Proxy(String host, int port) { - this.host = host; - this.port = port; - } + public static Proxy create(final URI uri) { + Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme()); + String userInfo = uri.getUserInfo(); + if (userInfo != null) { + String[] up = userInfo.split(":"); + if (up.length == 1) { + proxy.username = up[0].isEmpty() ? null : up[0]; + } else { + proxy.username = up[0].isEmpty() ? null : up[0]; + proxy.password = up[1].isEmpty() ? null : up[1]; + } + } + return proxy; + } - public Proxy(String host, int port, String username, String password) { - this.host = host; - this.port = port; - this.username = username; - this.password = password; - } + public Proxy(String host, int port) { + this(host, port, null); + } + + public Proxy(String host, int port, String scheme) { + this.host = host; + this.port = port; + this.scheme = scheme; + } + + public Proxy(String host, int port, String username, String password) { + this.host = host; + this.port = port; + this.username = username; + this.password = password; + } + + public String getScheme() { + return scheme; + } + + public void setScheme(String scheme) { + this.scheme = scheme; + } public String getHost() { - return host; - } - - public int getPort() { - return port; - } - - public String getUsername() { - return username; - } - - public String getPassword() { - return password; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Proxy proxy = (Proxy) o; - - if (port != proxy.port) return false; - if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false; - if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false; - return password != null ? password.equals(proxy.password) : proxy.password == null; - } - - @Override - public int hashCode() { - int result = host != null ? host.hashCode() : 0; - result = 31 * result + port; - result = 31 * result + (username != null ? username.hashCode() : 0); - result = 31 * result + (password != null ? password.hashCode() : 0); - return result; - } - - @Override - public String toString() { - return "Proxy{" + - "host='" + host + '\'' + - ", port=" + port + - ", username='" + username + '\'' + - ", password='" + password + '\'' + - '}'; - } + return host; + } + + public int getPort() { + return port; + } + + public String getUsername() { + return username; + } + + public String getPassword() { + return password; + } + + public URI toURI() { + final StringBuilder userInfoBuffer = new StringBuilder(); + if (username != null) { + userInfoBuffer.append(urlencode(username)); + } + if (password != null) { + userInfoBuffer.append(":").append(urlencode(password)); + } + final String userInfo = StringUtils.defaultIfEmpty(userInfoBuffer.toString(), null); + URI uri; + try { + uri = new URI(scheme, userInfo, host, port, null, null, null); + } catch (URISyntaxException e) { + throw new IllegalArgumentException(e.getMessage(), e); + } + return uri; + } + + private String urlencode(String s) { + String enc = StandardCharsets.UTF_8.name(); + try { + return URLEncoder.encode(s, enc); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException(e); + } + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Proxy proxy = (Proxy) o; + + if (port != proxy.port) return false; + if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false; + if (scheme != null ? !scheme.equals(proxy.scheme) : proxy.scheme != null) return false; + if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false; + return password != null ? password.equals(proxy.password) : proxy.password == null; + } + + @Override + public int hashCode() { + int result = host != null ? host.hashCode() : 0; + result = 31 * result + port; + result = 31 * result + (scheme != null ? scheme.hashCode() : 0); + result = 31 * result + (username != null ? username.hashCode() : 0); + result = 31 * result + (password != null ? password.hashCode() : 0); + return result; + } + + @Override + public String toString() { + return this.toURI().toString(); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 5b61a99..0cef4ed 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -25,5 +25,5 @@ public interface ProxyProvider { * @return proxy */ Proxy getProxy(Task task); - + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index d8f47fe..ddef6a8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -59,4 +59,5 @@ public class SimpleProxyProvider implements ProxyProvider { } return p % size; } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java index 5296a74..2dafe8e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.selector; -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - import java.util.ArrayList; import java.util.List; +import org.apache.commons.lang3.StringUtils; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + /** * Links selector based on jsoup. Use absolute url.
* @@ -23,9 +23,9 @@ public class LinksSelector extends BaseElementSelector { @Override public List selectList(Element element) { Elements elements = element.select("a"); - List links = new ArrayList(elements.size()); + List links = new ArrayList<>(elements.size()); for (Element element0 : elements) { - if (!StringUtil.isBlank(element0.baseUri())) { + if (StringUtils.isNotBlank(element0.baseUri())) { links.add(element0.attr("abs:href")); } else { links.add(element0.attr("href")); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 9ae538c..fb0a161 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -41,7 +41,7 @@ public class RegexSelector implements Selector { /** * Create a RegexSelector. When there is no capture group, the value is set to 0 else set to 1. - * @param regexStr + * @param regexStr the regular expression. */ public RegexSelector(String regexStr) { this.compileRegex(regexStr); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 86af367..8e4c820 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -1,45 +1,97 @@ package us.codecraft.webmagic.proxy; -import org.apache.http.HttpHost; -import org.junit.BeforeClass; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import java.net.URI; import java.util.ArrayList; import java.util.List; +import org.apache.http.HttpHost; +import org.junit.BeforeClass; +import org.junit.Test; + /** * @author yxssfxwzy@sina.com May 30, 2014 * */ public class ProxyTest { - private static List httpProxyList = new ArrayList(); - - @BeforeClass - public static void before() { - // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", - // "0.0.0.4:0" }; - String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; - for (String line : source) { - httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] }); - } - } - - class Fetch extends Thread { - HttpHost hp; - - public Fetch(HttpHost hp) { - this.hp = hp; - } - - @Override - public void run() { - try { - System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort()); - sleep(500); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } + private static List httpProxyList = new ArrayList(); + + @BeforeClass + public static void before() { + // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", + // "0.0.0.4:0" }; + String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; + for (String line : source) { + httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] }); + } + } + + class Fetch extends Thread { + HttpHost hp; + + public Fetch(HttpHost hp) { + this.hp = hp; + } + + @Override + public void run() { + try { + System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort()); + sleep(500); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + @Test + public void testCreate() { + Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertNull(proxy.getUsername()); + assertNull(proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("http://127.0.0.1:8080")); + assertEquals("http", proxy.getScheme()); + assertNull(proxy.getUsername()); + assertNull(proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("//username:password@127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertEquals("username", proxy.getUsername()); + assertEquals("password", proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("//username@127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertEquals("username", proxy.getUsername()); + assertNull(proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("//:password@127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertNull(proxy.getUsername()); + assertEquals("password", proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + } + + @Test + public void testToString() { + assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); + assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); + assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); + assertEquals("//username@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", null).toString()); + assertEquals("//:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, null, "password").toString()); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java new file mode 100644 index 0000000..f9e725e --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.utils; + +import org.junit.Assert; +import org.junit.Test; + +public class NumberUtilsTest { + + @Test + public void testCompareLong() { + Assert.assertEquals(0, NumberUtils.compareLong(0L, 0L)); + Assert.assertEquals(1, NumberUtils.compareLong(9L, 0L)); + Assert.assertEquals(-1, NumberUtils.compareLong(0L, 9L)); + Assert.assertEquals(-1, NumberUtils.compareLong(-9L, 0L)); + Assert.assertEquals(1, NumberUtils.compareLong(0L, -9L)); + } +} diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml new file mode 100644 index 0000000..16ed1b4 --- /dev/null +++ b/webmagic-coverage/pom.xml @@ -0,0 +1,72 @@ + + + 4.0.0 + + + us.codecraft + webmagic-parent + 0.7.5 + + + webmagic-coverage + pom + webmagic-coverage + Compute aggregated test code coverage + + + true + + + + + ${project.groupId} + webmagic-core + ${project.version} + + + ${project.groupId} + webmagic-extension + ${project.version} + + + ${project.groupId} + webmagic-scripts + ${project.version} + + + ${project.groupId} + webmagic-selenium + ${project.version} + + + ${project.groupId} + webmagic-saxon + ${project.version} + + + ${project.groupId} + webmagic-samples + ${project.version} + + + + + + + org.jacoco + jacoco-maven-plugin + + + + report-aggregate + + + + + + + + diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 7e949ca..85d5c63 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.3 + 0.7.5 4.0.0 @@ -13,16 +13,14 @@ redis.clients jedis - 2.9.0 com.google.guava guava - 15.0 true - us.codecraft + ${project.groupId} webmagic-core ${project.version} @@ -32,4 +30,4 @@ - \ No newline at end of file + diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java index 8ecb08f..9406abf 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java @@ -1,6 +1,8 @@ package us.codecraft.webmagic.example; -import org.apache.log4j.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.*; import us.codecraft.webmagic.handler.CompositePageProcessor; import us.codecraft.webmagic.handler.CompositePipeline; @@ -15,7 +17,7 @@ import us.codecraft.webmagic.handler.RequestMatcher; */ public class PatternProcessorExample { - private static Logger log = Logger.getLogger(PatternProcessorExample.class); + private static Logger log = LoggerFactory.getLogger(PatternProcessorExample.class); public static void main(String... args) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index cfb4a82..b213dda 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -68,6 +68,10 @@ public class SpiderMonitor { return new SpiderStatus(spider, monitorSpiderListener); } + protected List getSpiderStatuses() { + return this.spiderStatuses; + } + public static SpiderMonitor instance() { return INSTANCE; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java index a87c040..69afe04 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java @@ -84,8 +84,13 @@ public class SpiderStatus implements SpiderStatusMXBean { @Override public int getPagePerSecond() { - int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000; - return getSuccessPageCount() / runSeconds; + if (getStartTime() != null) { + int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000; + if (runSeconds != 0) { + return getSuccessPageCount() / runSeconds; + } + } + return -1; } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 6ca9828..fec3c1d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,12 +1,13 @@ package us.codecraft.webmagic.scheduler; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.math.NumberUtils; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.scheduler.component.DuplicateRemover; - -import java.io.*; +import java.io.BufferedReader; +import java.io.Closeable; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; import java.util.LinkedHashSet; import java.util.Set; import java.util.concurrent.BlockingQueue; @@ -17,6 +18,13 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.math.NumberUtils; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + /** * Store urls and cursor in files so that a Spider can resume the status when shutdown.
@@ -141,7 +149,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement urls.add(line.trim()); lineReaded++; if (lineReaded > cursor.get()) { - queue.add(new Request(line)); + queue.add(deserializeRequest(line)); } } } finally { @@ -183,7 +191,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement init(task); } queue.add(request); - fileUrlWriter.println(request.getUrl()); + fileUrlWriter.println(serializeRequest(request)); } @Override @@ -204,4 +212,13 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement public int getTotalRequestsCount(Task task) { return getDuplicateRemover().getTotalRequestsCount(task); } + + protected String serializeRequest(Request request) { + return request.getUrl(); + } + + protected Request deserializeRequest(String line) { + return new Request(line); + } + } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java index 540574a..46d47e5 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java @@ -1,22 +1,23 @@ package us.codecraft.webmagic.scheduler; -import com.alibaba.fastjson.JSON; +import java.util.Set; + import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSON; + import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import java.util.Set; - /** * the redis scheduler with priority * @author sai * Created by sai on 16-5-27. */ -public class RedisPriorityScheduler extends RedisScheduler -{ +public class RedisPriorityScheduler extends RedisScheduler { private static final String ZSET_PREFIX = "zset_"; @@ -37,62 +38,44 @@ public class RedisPriorityScheduler extends RedisScheduler } @Override - protected void pushWhenNoDuplicate(Request request, Task task) - { - Jedis jedis = pool.getResource(); - try - { - if(request.getPriority() > 0) + protected void pushWhenNoDuplicate(Request request, Task task) { + try (Jedis jedis = pool.getResource()) { + if (request.getPriority() > 0) { jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl()); - else if(request.getPriority() < 0) + } else if (request.getPriority() < 0) { jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl()); - else + } else { jedis.lpush(getQueueNoPriorityKey(task), request.getUrl()); + } setExtrasInItem(jedis, request, task); } - finally - { - pool.returnResource(jedis); - } } @Override - public synchronized Request poll(Task task) - { - Jedis jedis = pool.getResource(); - try - { + public synchronized Request poll(Task task) { + try (Jedis jedis = pool.getResource()) { String url = getRequest(jedis, task); - if(StringUtils.isBlank(url)) + if (StringUtils.isBlank(url)) { return null; + } return getExtrasInItem(jedis, url, task); } - finally - { - pool.returnResource(jedis); - } } - private String getRequest(Jedis jedis, Task task) - { + private String getRequest(Jedis jedis, Task task) { String url; Set urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0); - if(urls.isEmpty()) - { + if (urls.isEmpty()) { url = jedis.lpop(getQueueNoPriorityKey(task)); - if(StringUtils.isBlank(url)) - { + if (StringUtils.isBlank(url)) { urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0); - if(!urls.isEmpty()) - { + if (!urls.isEmpty()) { url = urls.toArray(new String[0])[0]; jedis.zrem(getZsetMinusPriorityKey(task), url); } } - } - else - { + } else { url = urls.toArray(new String[0])[0]; jedis.zrem(getZsetPlusPriorityKey(task), url); } @@ -100,51 +83,39 @@ public class RedisPriorityScheduler extends RedisScheduler } @Override - public void resetDuplicateCheck(Task task) - { - Jedis jedis = pool.getResource(); - try - { + public void resetDuplicateCheck(Task task) { + try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); } - finally - { - pool.returnResource(jedis); - } } - private String getZsetPlusPriorityKey(Task task) - { + private String getZsetPlusPriorityKey(Task task) { return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX; } - private String getQueueNoPriorityKey(Task task) - { + private String getQueueNoPriorityKey(Task task) { return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX; } - private String getZsetMinusPriorityKey(Task task) - { + private String getZsetMinusPriorityKey(Task task) { return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX; } - private void setExtrasInItem(Jedis jedis,Request request, Task task) - { - if(request.getExtras() != null) - { - String field = DigestUtils.shaHex(request.getUrl()); + private void setExtrasInItem(Jedis jedis,Request request, Task task) { + if (request.getExtras() != null) { + String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset(getItemKey(task), field, value); } } - private Request getExtrasInItem(Jedis jedis, String url, Task task) - { + private Request getExtrasInItem(Jedis jedis, String url, Task task) { String key = getItemKey(task); - String field = DigestUtils.shaHex(url); + String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); - if(bytes != null) + if (bytes != null) { return JSON.parseObject(new String(bytes), Request.class); + } return new Request(url); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index c70d885..19e8313 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -1,8 +1,10 @@ package us.codecraft.webmagic.scheduler; -import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSON; + import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -37,21 +39,15 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor @Override public void resetDuplicateCheck(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); - } finally { - pool.returnResource(jedis); } } @Override public boolean isDuplicate(Request request, Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { return jedis.sadd(getSetKey(task), request.getUrl()) == 0; - } finally { - pool.returnResource(jedis); } } @@ -62,7 +58,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor try { jedis.rpush(getQueueKey(task), request.getUrl()); if (checkForAdditionalInfo(request)) { - String field = DigestUtils.shaHex(request.getUrl()); + String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } @@ -100,14 +96,13 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor @Override public synchronized Request poll(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { String url = jedis.lpop(getQueueKey(task)); if (url == null) { return null; } String key = ITEM_PREFIX + task.getUUID(); - String field = DigestUtils.shaHex(url); + String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); if (bytes != null) { Request o = JSON.parseObject(new String(bytes), Request.class); @@ -115,8 +110,6 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor } Request request = new Request(url); return request; - } finally { - pool.returnResource(jedis); } } @@ -134,23 +127,17 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor @Override public int getLeftRequestsCount(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { Long size = jedis.llen(getQueueKey(task)); return size.intValue(); - } finally { - pool.returnResource(jedis); } } @Override public int getTotalRequestsCount(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { Long size = jedis.scard(getSetKey(task)); return size.intValue(); - } finally { - pool.returnResource(jedis); } } } diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 072bb3f..7f7ceb2 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.3 + 0.7.5 4.0.0 @@ -11,12 +11,12 @@ - us.codecraft + ${project.groupId} webmagic-core ${project.version} - us.codecraft + ${project.groupId} webmagic-extension ${project.version} @@ -24,6 +24,26 @@ junit junit + + org.mapdb + mapdb + 3.0.8 + + + com.fasterxml.jackson.core + jackson-core + 2.13.0-rc1 + + + com.fasterxml.jackson.core + jackson-annotations + 2.13.0-rc1 + + + com.fasterxml.jackson.core + jackson-databind + 2.13.2.1 + diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java new file mode 100644 index 0000000..bee80e7 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java @@ -0,0 +1,78 @@ +package us.codecraft.webmagic.recover; + +import com.google.common.base.Charsets; +import com.google.common.hash.BloomFilter; +import com.google.common.hash.Funnels; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.IndexTreeList; +import org.mapdb.Serializer; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.util.concurrent.atomic.AtomicInteger; + +/** + * @author :linweisen + */ +public class DuplicateStorageRemover implements DuplicateRemover { + + private DB db; + + private static String DATABASE_NAME = "duplicate"; + + private IndexTreeList urlDuplicateQueue; + + private BloomFilter bloomFilter; + + private AtomicInteger counter; + + public DuplicateStorageRemover(String path) { + + String duplicatStoragePath = path; + + DB db = DBMaker.fileDB(duplicatStoragePath) + .fileMmapEnableIfSupported() + .fileMmapPreclearDisable() + .cleanerHackEnable() + .closeOnJvmShutdown() + .transactionEnable() + .concurrencyScale(128) + .make(); + this.db = db; + + this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen(); + + counter = new AtomicInteger(this.urlDuplicateQueue.size()); + this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7); + for (String url : this.urlDuplicateQueue){ + bloomFilter.put(url); + } + + } + + @Override + public boolean isDuplicate(Request request, Task task) { + String url = request.getUrl(); + boolean isDuplicate = bloomFilter.mightContain(url); + if (!isDuplicate) { + bloomFilter.put(url); + urlDuplicateQueue.add(url); + this.db.commit(); + counter.incrementAndGet(); + } + return isDuplicate; + } + + @Override + public void resetDuplicateCheck(Task task) { + this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7); + this.urlDuplicateQueue.clear(); + } + + @Override + public int getTotalRequestsCount(Task task) { + return counter.get(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java new file mode 100644 index 0000000..4cee18a --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java @@ -0,0 +1,85 @@ +package us.codecraft.webmagic.recover; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.lang3.StringUtils; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.IndexTreeList; +import org.mapdb.Serializer; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.io.IOException; + +/** + * @author :linweisen + */ +public class MmapQueueScheduler extends DuplicateRemovedScheduler { + + private DB db; + + private static String DATABASE_NAME = "queue"; + + private IndexTreeList queue; + + private static ObjectMapper mapper; + + public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) { + super.setDuplicateRemover(duplicateRemover); + + String queuePath = path; + + DB db = DBMaker.fileDB(queuePath) + .fileMmapEnableIfSupported() + .fileMmapPreclearDisable() + .cleanerHackEnable() + .closeOnJvmShutdown() + .transactionEnable() + .concurrencyScale(128) + .make(); + this.db = db; + this.mapper = new ObjectMapper(); + this.queue = db.indexTreeList(MmapQueueScheduler.DATABASE_NAME, Serializer.STRING).createOrOpen(); + } + + @Override + public Request poll(Task task) { + if (this.queue.size() > 0){ + String s = queue.remove(0); + return fromJson(s, Request.class); + }else{ + return null; + } + + } + + @Override + public void pushWhenNoDuplicate(Request request, Task task) { + queue.add(toJson(request)); + this.db.commit(); + } + + public String toJson(Object object) { + try { + return mapper.writeValueAsString(object); + } catch (IOException e) { + logger.warn("write to json string error:" + object, e); + return null; + } + } + + public T fromJson(String jsonString, Class clazz) { + if (StringUtils.isEmpty(jsonString)) { + return null; + } + try { + return mapper.readValue(jsonString, clazz); + } catch (IOException e) { + logger.warn("parse json string error:" + jsonString, e); + return null; + } + } + +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java new file mode 100644 index 0000000..4fb91a0 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java @@ -0,0 +1,22 @@ +package us.codecraft.webmagic.recover; + + +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.samples.SinaBlogProcessor; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +/** + * @author code4crafter@gmail.com
+ */ +public class RecoverSample { + + public static void main(String[] args) { + String storage = "queue"; + String duplicate = "duplicate"; + Spider spider = new Spider(new SinaBlogProcessor()); + DuplicateRemover remover = new DuplicateStorageRemover(duplicate); + spider.setScheduler(new MmapQueueScheduler(remover, storage)); + spider.addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html") + .run(); + } +} diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 95f706e..119e50f 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.3 + 0.7.5 4.0.0 @@ -11,19 +11,17 @@ - us.codecraft + ${project.groupId} webmagic-core ${project.version} net.sourceforge.htmlcleaner htmlcleaner - 2.5 net.sf.saxon Saxon-HE - 9.5.1-1 junit @@ -34,7 +32,9 @@ + org.apache.maven.plugins maven-deploy-plugin + 3.0.0-M1 true diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 98b1efe..1f1f0a5 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -1,15 +1,11 @@ package us.codecraft.webmagic.selector; -import net.sf.saxon.lib.NamespaceConstant; -import net.sf.saxon.xpath.XPathEvaluator; -import org.apache.log4j.Logger; -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.DomSerializer; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; import javax.xml.transform.OutputKeys; @@ -20,12 +16,19 @@ import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; + +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.xpath.XPathEvaluator; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
@@ -40,7 +43,7 @@ public class Xpath2Selector implements Selector { private XPathExpression xPathExpression; - private Logger logger = Logger.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); public Xpath2Selector(String xpathStr) { this.xpathStr = xpathStr; diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index aa3765a..32906b5 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.selector; +import java.util.List; + import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.htmlcleaner.XPatherException; @@ -8,6 +10,7 @@ import org.jsoup.nodes.Document; import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; + import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -1367,15 +1370,19 @@ public class XpathSelectorTest { public void testXPath2() { String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; - XpathSelector xpathSelector = new XpathSelector("//h1/text()"); - System.out.println(xpathSelector.select(text)); + Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()"); + Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text)); } @Test public void testXpath2Selector() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); String select = xpath2Selector.select(html); - Assert.assertNotNull(select); + Assert.assertEquals("http://www.oschina.net/", select); + + List selectList = xpath2Selector.selectList(html); + Assert.assertEquals(113, selectList.size()); + Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } @Ignore("take long time") diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml old mode 100755 new mode 100644 index 22956cb..f0c1679 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,41 +3,32 @@ webmagic-parent us.codecraft - 0.7.3 + 0.7.5 4.0.0 - us.codecraft webmagic-scripts - 1.1.2-2 + 1.6.0 org.jruby jruby - 1.7.6 org.jetbrains.kotlin kotlin-stdlib ${kotlin.version} - - org.codehaus.groovy - groovy-all - 2.1.6 - - org.python + org.python jython - 2.5.3 commons-cli commons-cli - 1.2 junit @@ -45,12 +36,16 @@ test - us.codecraft + ${project.groupId} webmagic-core ${project.version} - us.codecraft + org.slf4j + slf4j-log4j12 + + + ${project.groupId} webmagic-extension ${project.version} @@ -59,21 +54,6 @@ ${project.basedir}/src/main/java - - maven-compiler-plugin - - 1.6 - 1.6 - UTF-8 - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - org.apache.maven.plugins maven-jar-plugin diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 1cbf592..42a6da9 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.3 + 0.7.5 4.0.0 @@ -13,21 +13,16 @@ org.seleniumhq.selenium selenium-java - 2.41.0 - us.codecraft + ${project.groupId} webmagic-core ${project.version} com.github.detro phantomjsdriver - 1.2.0 - - - junit junit @@ -37,7 +32,9 @@ + org.apache.maven.plugins maven-deploy-plugin + 3.0.0-M1 true diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index f45f7e2..cce293f 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.downloader.selenium; -import org.apache.log4j.Logger; import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -29,7 +31,7 @@ public class SeleniumDownloader implements Downloader, Closeable { private volatile WebDriverPool webDriverPool; - private Logger logger = Logger.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); private int sleepTime = 0; diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index 1472cb3..e1d9dd0 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.downloader.selenium; -import org.apache.log4j.Logger; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.firefox.FirefoxDriver; @@ -8,6 +7,8 @@ import org.openqa.selenium.phantomjs.PhantomJSDriver; import org.openqa.selenium.phantomjs.PhantomJSDriverService; import org.openqa.selenium.remote.DesiredCapabilities; import org.openqa.selenium.remote.RemoteWebDriver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.FileReader; import java.io.IOException; @@ -27,7 +28,7 @@ import java.util.concurrent.atomic.AtomicInteger; * Time: 下午1:41
*/ class WebDriverPool { - private Logger logger = Logger.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); private final static int DEFAULT_CAPACITY = 5; -- Gitee