diff --git a/README-zh.md b/README-zh.md
index cd1b090c73dc42fa6f676cb5fdddb70df04487b2..62b3c9a5edd1ceb0f15d66dad6671faf51c2b91c 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -1,9 +1,10 @@

+[Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
+[License: Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.html)
[](https://travis-ci.org/code4craft/webmagic)
-
官方网站[http://webmagic.io/](http://webmagic.io/)
>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。
@@ -38,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
us.codecraft
webmagic-core
- 0.7.3
+ 0.7.5
us.codecraft
webmagic-extension
- 0.7.3
+ 0.7.5
```
@@ -93,7 +94,7 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较
PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码:
```java
-public class OschinaBlogPageProcesser implements PageProcessor {
+public class OschinaBlogPageProcessor implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net");
@@ -113,7 +114,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
public static void main(String[] args) {
- Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog")
+ Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog")
.addPipeline(new ConsolePipeline()).run();
}
}
diff --git a/README.md b/README.md
index 73cb48833bf10506414b63a31d24efff00626c46..14aeac7b19824bec94bb174d865aaf5440f6bb8b 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,8 @@
[Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md)
+[Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
+[License: Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.html)
[](https://travis-ci.org/code4craft/webmagic)
>A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler.
@@ -23,12 +25,12 @@ Add dependencies to your pom.xml:
us.codecraft
webmagic-core
- 0.7.3
+ 0.7.5
us.codecraft
webmagic-extension
- 0.7.3
+ 0.7.5
```
diff --git a/pom.xml b/pom.xml
index 2b2384fd827a7b2de547ae81e95df0ba901db2e4..dea6deac290c9893333cb59de9a9cf2243e1c657 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,19 +1,15 @@
-
- org.sonatype.oss
- oss-parent
- 7
-
us.codecraft
- 0.7.3
+ 0.7.5
4.0.0
pom
UTF-8
UTF-8
+ 1.8
+ 1.8
4.0.0.RELEASE
-
webmagic-parent
webmagic-parent
@@ -38,8 +34,8 @@
scm:git:git@github.com:code4craft/webmagic.git
scm:git:git@github.com:code4craft/webmagic.git
git@github.com:code4craft/webmagic.git
- webmagic-parent-0.6.1
-
+ WebMagic-${project.version}
+
Apache License, Version 2.0
@@ -54,6 +50,7 @@
webmagic-selenium
webmagic-saxon
webmagic-samples
+ webmagic-coverage
@@ -61,7 +58,7 @@
junit
junit
- 4.11
+ 4.13.1
test
@@ -73,47 +70,47 @@
org.apache.httpcomponents
httpclient
- 4.5.2
+ 4.5.13
org.apache.httpcomponents
httpcore
- 4.4.4
+ 4.4.14
com.google.guava
guava
- 15.0
+ 30.1-jre
com.jayway.jsonpath
json-path
- 2.4.0
+ 2.6.0
org.slf4j
slf4j-api
- 1.7.6
+ 1.7.30
org.slf4j
slf4j-log4j12
- 1.7.6
+ 1.7.30
us.codecraft
xsoup
- 0.3.1
+ 0.3.4
com.alibaba
fastjson
- 1.2.28
+ 1.2.83
com.github.dreamhead
moco-core
- 0.11.0
+ 1.3.0
test
@@ -130,13 +127,13 @@
org.assertj
assertj-core
- 1.5.0
+ 3.18.1
test
org.apache.commons
commons-lang3
- 3.1
+ 3.11
commons-collections
@@ -144,43 +141,87 @@
3.2.2
- org.apache.commons
+ commons-io
commons-io
- 1.3.2
+ 2.8.0
- org.jsoup
- jsoup
- 1.10.3
+ org.codehaus.groovy
+ groovy-all
+ 3.0.7
- org.mockito
- mockito-all
- 1.9.5
- test
+ org.jruby
+ jruby
+ 9.3.0.0
+
+
+ org.python
+ jython
+ 2.7.2
+
+
+ org.seleniumhq.selenium
+ selenium-java
+ 3.141.59
+
+
+ net.sf.saxon
+ Saxon-HE
+ 10.3
+
+
+ net.sourceforge.htmlcleaner
+ htmlcleaner
+ 2.26
+
+
+ com.github.detro
+ phantomjsdriver
+ 1.2.0
+
+
+ commons-cli
+ commons-cli
+ 1.4
+
+
+ redis.clients
+ jedis
+ 3.6.0
+
+ org.apache.maven.plugins
+ maven-enforcer-plugin
+ 3.0.0-M3
+
+
+ enforce-maven
+
+ enforce
+
+
+
+
+ 3.3.9
+
+
+
+
+
+
org.apache.maven.plugins
maven-surefire-plugin
- 2.18
-
- 0
-
org.apache.maven.plugins
maven-compiler-plugin
- 3.1
-
- 1.6
- 1.6
- UTF-8
-
@@ -205,10 +246,6 @@
org.apache.maven.plugins
maven-resources-plugin
- 2.6
-
- UTF-8
-
org.apache.maven.plugins
@@ -222,7 +259,7 @@
org.apache.maven.plugins
maven-source-plugin
- 2.2.1
+ 3.2.1
attach-sources
@@ -235,11 +272,15 @@
org.apache.maven.plugins
maven-javadoc-plugin
- 2.10.4
+ 3.2.0
UTF-8
- WebMagic 0.7.3
+ WebMagic ${project.version}
en_US
+
+
+ false
+
@@ -260,11 +301,149 @@
org.apache.maven.plugins
maven-release-plugin
- 2.4.1
+ 3.0.0-M1
+
+
+ org.jacoco
+ jacoco-maven-plugin
+
+
+
+ prepare-agent
+
+
+
+ report
+ verify
+
+ report
+
+
+
+
+
+ com.amashchenko.maven.plugin
+ gitflow-maven-plugin
+
+
+ WebMagic-
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-clean-plugin
+ 3.1.0
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.8.1
+
+
+ org.apache.maven.plugins
+ maven-deploy-plugin
+ 3.0.0-M1
+
+
+ org.apache.maven.plugins
+ maven-install-plugin
+ 3.0.0-M1
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+ 3.2.0
+
+
+ org.apache.maven.plugins
+ maven-jxr-plugin
+ 3.1.1
+
+
+ org.apache.maven.plugins
+ maven-pmd-plugin
+ 3.14.0
+
+
+ org.apache.maven.plugins
+ maven-resources-plugin
+ 3.2.0
+
+
+ org.apache.maven.plugins
+ maven-site-plugin
+ 3.9.1
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.0.0-M5
+
+
+ org.apache.maven.plugins
+ maven-surefire-report-plugin
+ 3.0.0-M5
+
+
+ org.codehaus.mojo
+ taglist-maven-plugin
+ 2.4
+
+
+ org.jacoco
+ jacoco-maven-plugin
+ 0.8.7
+
+
+ com.amashchenko.maven.plugin
+ gitflow-maven-plugin
+ 1.15.0
+
+
+ com.github.spotbugs
+ spotbugs-maven-plugin
+ 4.2.3
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+
+ none
+
+
+
+ org.apache.maven.plugins
+ maven-jxr-plugin
+
+
+ org.apache.maven.plugins
+ maven-pmd-plugin
+
+
+ org.apache.maven.plugins
+ maven-surefire-report-plugin
+
+
+ org.codehaus.mojo
+ taglist-maven-plugin
+
+
+ com.github.spotbugs
+ spotbugs-maven-plugin
+
+
+
+
release
@@ -315,7 +494,7 @@
org.sonatype.plugins
nexus-staging-maven-plugin
- 1.6
+ 1.6.8
true
sonatype-nexus-staging
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index e889cd491b6daa97c94e08b7238e540c7a69cd02..ec718a1e3e80a5cebe583625d4a13e6b6b3c77fe 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -3,7 +3,7 @@
us.codecraft
webmagic-parent
- 0.7.3
+ 0.7.5
4.0.0
@@ -48,6 +48,7 @@
org.slf4j
slf4j-log4j12
+ true
@@ -61,12 +62,7 @@
- org.jsoup
- jsoup
-
-
-
- org.apache.commons
+ commons-io
commons-io
@@ -82,4 +78,4 @@
-
\ No newline at end of file
+
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
index eefd91bb521fb15507856132a6897554da0f302a..9fc2861923669695b16b869979ee937f21d1ea2d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
@@ -1,5 +1,6 @@
package us.codecraft.webmagic;
+import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.utils.Experimental;
@@ -26,6 +27,11 @@ public class Request implements Serializable {
private HttpRequestBody requestBody;
+ /**
+ * Downloader to use for this request; when set, it is used instead of the spider's default downloader.
+ */
+ private Downloader downloader;
+
/**
* Store additional information in extras.
*/
@@ -78,14 +84,15 @@ public class Request implements Serializable {
return this;
}
- public Object getExtra(String key) {
+ @SuppressWarnings("unchecked")
+ public T getExtra(String key) {
if (extras == null) {
return null;
}
- return extras.get(key);
+ return (T) extras.get(key);
}
- public Request putExtra(String key, Object value) {
+ public Request putExtra(String key, T value) {
if (extras == null) {
extras = new HashMap();
}
@@ -174,6 +181,14 @@ public class Request implements Serializable {
return binaryContent;
}
+ public Downloader getDownloader() {
+ return downloader;
+ }
+
+ public void setDownloader(Downloader downloader) {
+ this.downloader = downloader;
+ }
+
public Request setBinaryContent(boolean binaryContent) {
this.binaryContent = binaryContent;
return this;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
index 7b543613d7ac786972829eb65e776c891dd9f441..488c81e77946e92929953fb77296778053581824 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
@@ -1,6 +1,5 @@
package us.codecraft.webmagic;
-import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
@@ -21,6 +20,7 @@ public class ResultItems {
private boolean skip;
+ @SuppressWarnings("unchecked")
public T get(String key) {
Object o = fields.get(key);
if (o == null) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index b6963ca43c7e4774da6577d9c46c230703eb33d2..4879b2825148fe87c346430da861f7059f80d510 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -1,8 +1,13 @@
package us.codecraft.webmagic;
-import us.codecraft.webmagic.utils.HttpConstant;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
-import java.util.*;
+import us.codecraft.webmagic.utils.HttpConstant;
/**
* Object contains setting for crawler.
@@ -203,7 +208,7 @@ public class Site {
/**
* Set the interval between the processing of two pages.
- * Time unit is micro seconds.
+ * Time unit is milliseconds.
*
* @param sleepTime sleepTime
* @return this
@@ -215,7 +220,7 @@ public class Site {
/**
* Get the interval between the processing of two pages.
- * Time unit is micro seconds.
+ * Time unit is milliseconds.
*
* @return the interval between the processing of two pages,
*/
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 62c989f1d3479eea3ac636ba05acd0576fc21dad..5940e738db9bfe0dae27954306beacddae967f6e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -208,7 +208,8 @@ public class Spider implements Runnable, Task {
* @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
* @deprecated
*/
- public Spider pipeline(Pipeline pipeline) {
+ @Deprecated
+ public Spider pipeline(Pipeline pipeline) {
return addPipeline(pipeline);
}
@@ -258,7 +259,8 @@ public class Spider implements Runnable, Task {
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
* @deprecated
*/
- public Spider downloader(Downloader downloader) {
+ @Deprecated
+ public Spider downloader(Downloader downloader) {
return setDownloader(downloader);
}
@@ -320,7 +322,7 @@ public class Spider implements Runnable, Task {
processRequest(request);
onSuccess(request);
} catch (Exception e) {
- onError(request);
+ onError(request, e);
logger.error("process request " + request + " error", e);
} finally {
pageCount.incrementAndGet();
@@ -338,10 +340,19 @@ public class Spider implements Runnable, Task {
logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
}
+ /**
+ * @deprecated Use {@link #onError(Request, Exception)} instead.
+ */
+ @Deprecated
protected void onError(Request request) {
+ }
+
+ protected void onError(Request request, Exception e) {
+ this.onError(request);
+
if (CollectionUtils.isNotEmpty(spiderListeners)) {
for (SpiderListener spiderListener : spiderListeners) {
- spiderListener.onError(request);
+ spiderListener.onError(request, e);
}
}
}
@@ -401,7 +412,12 @@ public class Spider implements Runnable, Task {
}
private void processRequest(Request request) {
- Page page = downloader.download(request, this);
+ Page page;
+ if (null != request.getDownloader()){
+ page = request.getDownloader().download(request,this);
+ }else {
+ page = downloader.download(request, this);
+ }
if (page.isDownloadSuccess()){
onDownloadSuccess(request, page);
} else {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java
index 0678180385f7289167fb9f6393c6c0010fb85520..8f10e0ef0973d7a383a7094c956e0aaad5d2e9bb 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java
@@ -10,5 +10,14 @@ public interface SpiderListener {
public void onSuccess(Request request);
+ /**
+ * @deprecated Use {@link #onError(Request, Exception)} instead.
+ */
+ @Deprecated
public void onError(Request request);
+
+ default void onError(Request request, Exception e) {
+ this.onError(request);
+ }
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 24889c88b22b51b236b31f10667c74bff913aaff..49217e111744d52ffc9fbb850a82f335f87f00c0 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -1,5 +1,10 @@
package us.codecraft.webmagic.downloader;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
@@ -7,6 +12,7 @@ import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
@@ -17,12 +23,6 @@ import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.Map;
-
-
/**
* The http downloader based on HttpClient.
*
@@ -38,7 +38,7 @@ public class HttpClientDownloader extends AbstractDownloader {
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
-
+
private ProxyProvider proxyProvider;
private boolean responseHeader = true;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index 28a16f41d5eaf8101a9ec463b8d86938e305da12..80e0f1085f36d5d77756f3bfeb34ed9e66ad50ba 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -1,5 +1,18 @@
package us.codecraft.webmagic.downloader;
+import java.io.IOException;
+import java.security.KeyManagementException;
+import java.security.NoSuchAlgorithmException;
+import java.security.cert.CertificateException;
+import java.security.cert.X509Certificate;
+import java.util.Map;
+
+import javax.net.ssl.SSLContext;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
+
+import org.apache.commons.lang3.JavaVersion;
+import org.apache.commons.lang3.SystemUtils;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
@@ -11,32 +24,27 @@ import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
-import org.apache.http.impl.client.*;
+import org.apache.http.impl.client.BasicCookieStore;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import us.codecraft.webmagic.Site;
-import javax.net.ssl.SSLContext;
-import javax.net.ssl.TrustManager;
-import javax.net.ssl.X509TrustManager;
-import java.io.IOException;
-import java.security.KeyManagementException;
-import java.security.NoSuchAlgorithmException;
-import java.security.cert.CertificateException;
-import java.security.cert.X509Certificate;
-import java.util.Map;
+import us.codecraft.webmagic.Site;
/**
* @author code4crafter@gmail.com
* @since 0.4.0
*/
public class HttpClientGenerator {
-
+
private transient Logger logger = LoggerFactory.getLogger(getClass());
-
+
private PoolingHttpClientConnectionManager connectionManager;
public HttpClientGenerator() {
@@ -48,43 +56,51 @@ public class HttpClientGenerator {
connectionManager.setDefaultMaxPerRoute(100);
}
- private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
- try {
- return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"},
+ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
+ try {
+ SSLContext sslContext = createIgnoreVerifySSL();
+ String[] supportedProtocols;
+ if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
+ supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" };
+ } else {
+ supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" };
+ }
+ logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
+ return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
null,
new DefaultHostnameVerifier()); // 优先绕过安全证书
- } catch (KeyManagementException e) {
+ } catch (KeyManagementException e) {
logger.error("ssl connection fail", e);
} catch (NoSuchAlgorithmException e) {
logger.error("ssl connection fail", e);
}
- return SSLConnectionSocketFactory.getSocketFactory();
+ return SSLConnectionSocketFactory.getSocketFactory();
}
- private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
- // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
- X509TrustManager trustManager = new X509TrustManager() {
-
- @Override
- public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
- }
-
- @Override
- public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
- }
-
- @Override
- public X509Certificate[] getAcceptedIssuers() {
- return null;
- }
-
- };
-
- SSLContext sc = SSLContext.getInstance("SSLv3");
- sc.init(null, new TrustManager[] { trustManager }, null);
- return sc;
+ private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
+ // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
+ X509TrustManager trustManager = new X509TrustManager() {
+
+ @Override
+ public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
+ }
+
+ @Override
+ public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
+ }
+
+ @Override
+ public X509Certificate[] getAcceptedIssuers() {
+ return null;
+ }
+
+ };
+
+ SSLContext sc = SSLContext.getInstance("TLS");
+ sc.init(null, new TrustManager[] { trustManager }, null);
+ return sc;
}
-
+
public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize);
return this;
@@ -96,7 +112,7 @@ public class HttpClientGenerator {
private CloseableHttpClient generateClient(Site site) {
HttpClientBuilder httpClientBuilder = HttpClients.custom();
-
+
httpClientBuilder.setConnectionManager(connectionManager);
if (site.getUserAgent() != null) {
httpClientBuilder.setUserAgent(site.getUserAgent());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
index 28a7ce5ea22c9b8827a8c77a3dc0438963c1a612..4baaf4a4a899ce25689efc0f86e478afdcfadccb 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
@@ -74,7 +74,7 @@ public class HttpUriRequestConverter {
}
if (proxy != null) {
- requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort()));
+ requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
}
requestBuilder.setConfig(requestConfigBuilder.build());
HttpUriRequest httpUriRequest = requestBuilder.build();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
index c5f100732c03346e7f39490e4aa4b33b2926be42..6554fab51336df0dee914b57c1014ec97147e7c2 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
@@ -1,73 +1,135 @@
package us.codecraft.webmagic.proxy;
-/**
- *
- */
+import java.io.UnsupportedEncodingException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.lang3.StringUtils;
public class Proxy {
- private String host;
- private int port;
- private String username;
- private String password;
+ private String scheme;
+
+ private String host;
+
+ private int port;
+
+ private String username;
+
+ private String password;
- public Proxy(String host, int port) {
- this.host = host;
- this.port = port;
- }
+ public static Proxy create(final URI uri) {
+ Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme());
+ String userInfo = uri.getUserInfo();
+ if (userInfo != null) {
+ String[] up = userInfo.split(":");
+ if (up.length == 1) {
+ proxy.username = up[0].isEmpty() ? null : up[0];
+ } else {
+ proxy.username = up[0].isEmpty() ? null : up[0];
+ proxy.password = up[1].isEmpty() ? null : up[1];
+ }
+ }
+ return proxy;
+ }
- public Proxy(String host, int port, String username, String password) {
- this.host = host;
- this.port = port;
- this.username = username;
- this.password = password;
- }
+ public Proxy(String host, int port) {
+ this(host, port, null);
+ }
+
+ public Proxy(String host, int port, String scheme) {
+ this.host = host;
+ this.port = port;
+ this.scheme = scheme;
+ }
+
+ public Proxy(String host, int port, String username, String password) {
+ this.host = host;
+ this.port = port;
+ this.username = username;
+ this.password = password;
+ }
+
+ public String getScheme() {
+ return scheme;
+ }
+
+ public void setScheme(String scheme) {
+ this.scheme = scheme;
+ }
public String getHost() {
- return host;
- }
-
- public int getPort() {
- return port;
- }
-
- public String getUsername() {
- return username;
- }
-
- public String getPassword() {
- return password;
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
-
- Proxy proxy = (Proxy) o;
-
- if (port != proxy.port) return false;
- if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false;
- if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false;
- return password != null ? password.equals(proxy.password) : proxy.password == null;
- }
-
- @Override
- public int hashCode() {
- int result = host != null ? host.hashCode() : 0;
- result = 31 * result + port;
- result = 31 * result + (username != null ? username.hashCode() : 0);
- result = 31 * result + (password != null ? password.hashCode() : 0);
- return result;
- }
-
- @Override
- public String toString() {
- return "Proxy{" +
- "host='" + host + '\'' +
- ", port=" + port +
- ", username='" + username + '\'' +
- ", password='" + password + '\'' +
- '}';
- }
+ return host;
+ }
+
+ public int getPort() {
+ return port;
+ }
+
+ public String getUsername() {
+ return username;
+ }
+
+ public String getPassword() {
+ return password;
+ }
+
+ public URI toURI() {
+ final StringBuilder userInfoBuffer = new StringBuilder();
+ if (username != null) {
+ userInfoBuffer.append(urlencode(username));
+ }
+ if (password != null) {
+ userInfoBuffer.append(":").append(urlencode(password));
+ }
+ final String userInfo = StringUtils.defaultIfEmpty(userInfoBuffer.toString(), null);
+ URI uri;
+ try {
+ uri = new URI(scheme, userInfo, host, port, null, null, null);
+ } catch (URISyntaxException e) {
+ throw new IllegalArgumentException(e.getMessage(), e);
+ }
+ return uri;
+ }
+
+ private String urlencode(String s) {
+ String enc = StandardCharsets.UTF_8.name();
+ try {
+ return URLEncoder.encode(s, enc);
+ } catch (UnsupportedEncodingException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ Proxy proxy = (Proxy) o;
+
+ if (port != proxy.port) return false;
+ if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false;
+ if (scheme != null ? !scheme.equals(proxy.scheme) : proxy.scheme != null) return false;
+ if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false;
+ return password != null ? password.equals(proxy.password) : proxy.password == null;
+ }
+
+ @Override
+ public int hashCode() {
+ int result = host != null ? host.hashCode() : 0;
+ result = 31 * result + port;
+ result = 31 * result + (scheme != null ? scheme.hashCode() : 0);
+ result = 31 * result + (username != null ? username.hashCode() : 0);
+ result = 31 * result + (password != null ? password.hashCode() : 0);
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return this.toURI().toString();
+ }
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
index 5b61a993ac0b533ed0cc3d51f6d0fa2a6f349726..0cef4ed4259b3e5305a47d38aeecd6ec923ff9c4 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
@@ -25,5 +25,5 @@ public interface ProxyProvider {
* @return proxy
*/
Proxy getProxy(Task task);
-
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
index d8f47fe44bd506c1482a18580fc64ed2051c212c..ddef6a88c64d23b4c403f6873fe2c38c603ac983 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
@@ -59,4 +59,5 @@ public class SimpleProxyProvider implements ProxyProvider {
}
return p % size;
}
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
index 5296a74bdaec2e3fbf6087ddf8b85328d6191fe8..2dafe8ee92df02c28c91c195f80fce120f32e0c4 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
@@ -1,12 +1,12 @@
package us.codecraft.webmagic.selector;
-import org.jsoup.helper.StringUtil;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
-
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
/**
* Links selector based on jsoup. Use absolute url.
*
@@ -23,9 +23,9 @@ public class LinksSelector extends BaseElementSelector {
@Override
public List selectList(Element element) {
Elements elements = element.select("a");
- List links = new ArrayList(elements.size());
+ List links = new ArrayList<>(elements.size());
for (Element element0 : elements) {
- if (!StringUtil.isBlank(element0.baseUri())) {
+ if (StringUtils.isNotBlank(element0.baseUri())) {
links.add(element0.attr("abs:href"));
} else {
links.add(element0.attr("href"));
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
index 9ae538c0ff42788df0eac8f62c97421f32a8604c..fb0a161d23ed72fecd4885a478bc2ec1e275902e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
@@ -41,7 +41,7 @@ public class RegexSelector implements Selector {
/**
* Create a RegexSelector. When there is no capture group, the value is set to 0 else set to 1.
- * @param regexStr
+ * @param regexStr the regular expression.
*/
public RegexSelector(String regexStr) {
this.compileRegex(regexStr);
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
index 86af36720c64a17fbbec19e45c5c5b1418751f8b..8e4c82026fac8ff7693d4e63b0481d8e114b3a33 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
@@ -1,45 +1,97 @@
package us.codecraft.webmagic.proxy;
-import org.apache.http.HttpHost;
-import org.junit.BeforeClass;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import java.net.URI;
import java.util.ArrayList;
import java.util.List;
+import org.apache.http.HttpHost;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
/**
* @author yxssfxwzy@sina.com May 30, 2014
*
*/
public class ProxyTest {
- private static List httpProxyList = new ArrayList();
-
- @BeforeClass
- public static void before() {
- // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
- // "0.0.0.4:0" };
- String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
- for (String line : source) {
- httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] });
- }
- }
-
- class Fetch extends Thread {
- HttpHost hp;
-
- public Fetch(HttpHost hp) {
- this.hp = hp;
- }
-
- @Override
- public void run() {
- try {
- System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort());
- sleep(500);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- }
+ private static List httpProxyList = new ArrayList();
+
+ @BeforeClass
+ public static void before() {
+ // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
+ // "0.0.0.4:0" };
+ String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
+ for (String line : source) {
+ httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] });
+ }
+ }
+
+ class Fetch extends Thread {
+ HttpHost hp;
+
+ public Fetch(HttpHost hp) {
+ this.hp = hp;
+ }
+
+ @Override
+ public void run() {
+ try {
+ System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort());
+ sleep(500);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ @Test
+ public void testCreate() {
+ Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080"));
+ assertNull(proxy.getScheme());
+ assertNull(proxy.getUsername());
+ assertNull(proxy.getPassword());
+ assertEquals("127.0.0.1", proxy.getHost());
+ assertEquals(8080, proxy.getPort());
+
+ proxy = Proxy.create(URI.create("http://127.0.0.1:8080"));
+ assertEquals("http", proxy.getScheme());
+ assertNull(proxy.getUsername());
+ assertNull(proxy.getPassword());
+ assertEquals("127.0.0.1", proxy.getHost());
+ assertEquals(8080, proxy.getPort());
+
+ proxy = Proxy.create(URI.create("//username:password@127.0.0.1:8080"));
+ assertNull(proxy.getScheme());
+ assertEquals("username", proxy.getUsername());
+ assertEquals("password", proxy.getPassword());
+ assertEquals("127.0.0.1", proxy.getHost());
+ assertEquals(8080, proxy.getPort());
+
+ proxy = Proxy.create(URI.create("//username@127.0.0.1:8080"));
+ assertNull(proxy.getScheme());
+ assertEquals("username", proxy.getUsername());
+ assertNull(proxy.getPassword());
+ assertEquals("127.0.0.1", proxy.getHost());
+ assertEquals(8080, proxy.getPort());
+
+ proxy = Proxy.create(URI.create("//:password@127.0.0.1:8080"));
+ assertNull(proxy.getScheme());
+ assertNull(proxy.getUsername());
+ assertEquals("password", proxy.getPassword());
+ assertEquals("127.0.0.1", proxy.getHost());
+ assertEquals(8080, proxy.getPort());
+ }
+
+ @Test
+ public void testToString() {
+ assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString());
+ assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString());
+ assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString());
+ assertEquals("//username@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", null).toString());
+ assertEquals("//:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, null, "password").toString());
+ }
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..f9e725e29cd4b2b61e79482a8ab6fd2564f8f73b
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java
@@ -0,0 +1,16 @@
+package us.codecraft.webmagic.utils;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit test for {@code NumberUtils.compareLong}.
+ */
+public class NumberUtilsTest {
+
+    /**
+     * Expects 0 for equal arguments, 1 when the first argument is larger and -1 when it is
+     * smaller, for both positive and negative operands.
+     */
+    @Test
+    public void testCompareLong() {
+        final long zero = 0L;
+        final long nine = 9L;
+        final long minusNine = -9L;
+
+        int equal = NumberUtils.compareLong(zero, zero);
+        Assert.assertEquals(0, equal);
+
+        int positiveVsZero = NumberUtils.compareLong(nine, zero);
+        Assert.assertEquals(1, positiveVsZero);
+
+        int zeroVsPositive = NumberUtils.compareLong(zero, nine);
+        Assert.assertEquals(-1, zeroVsPositive);
+
+        int negativeVsZero = NumberUtils.compareLong(minusNine, zero);
+        Assert.assertEquals(-1, negativeVsZero);
+
+        int zeroVsNegative = NumberUtils.compareLong(zero, minusNine);
+        Assert.assertEquals(1, zeroVsNegative);
+    }
+}
diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml
new file mode 100644
index 0000000000000000000000000000000000000000..16ed1b456412621f457b8171193acb5bfd883685
--- /dev/null
+++ b/webmagic-coverage/pom.xml
@@ -0,0 +1,72 @@
+
+
+ 4.0.0
+
+
+ us.codecraft
+ webmagic-parent
+ 0.7.5
+
+
+ webmagic-coverage
+ pom
+ webmagic-coverage
+ Compute aggregated test code coverage
+
+
+ true
+
+
+
+
+ ${project.groupId}
+ webmagic-core
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-extension
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-scripts
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-selenium
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-saxon
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-samples
+ ${project.version}
+
+
+
+
+
+
+ org.jacoco
+ jacoco-maven-plugin
+
+
+
+ report-aggregate
+
+
+
+
+
+
+
+
diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml
index 7e949ca6f9fb43866b32da1705e835529e8339e0..85d5c639408304f7c1d51acd0c339d928ec89101 100644
--- a/webmagic-extension/pom.xml
+++ b/webmagic-extension/pom.xml
@@ -3,7 +3,7 @@
us.codecraft
webmagic-parent
- 0.7.3
+ 0.7.5
4.0.0
@@ -13,16 +13,14 @@
redis.clients
jedis
- 2.9.0
com.google.guava
guava
- 15.0
true
- us.codecraft
+ ${project.groupId}
webmagic-core
${project.version}
@@ -32,4 +30,4 @@
-
\ No newline at end of file
+
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java
index 8ecb08fe62a5de50bbf823bc9e73b4091c5218f9..9406abfd2a613f73ef998606b5f1d46da1fe0c9c 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java
@@ -1,6 +1,8 @@
package us.codecraft.webmagic.example;
-import org.apache.log4j.Logger;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.handler.CompositePageProcessor;
import us.codecraft.webmagic.handler.CompositePipeline;
@@ -15,7 +17,7 @@ import us.codecraft.webmagic.handler.RequestMatcher;
*/
public class PatternProcessorExample {
- private static Logger log = Logger.getLogger(PatternProcessorExample.class);
+ private static Logger log = LoggerFactory.getLogger(PatternProcessorExample.class);
public static void main(String... args) {
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
index cfb4a8200031e0daf9b2af31b0985adc0be6dfaf..b213dda94d0c47b16b79ef945a046ab9f5648f8b 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
@@ -68,6 +68,10 @@ public class SpiderMonitor {
return new SpiderStatus(spider, monitorSpiderListener);
}
+ /**
+ * Returns the {@code spiderStatuses} list held by this monitor.
+ *
+ * @return the monitor's own list instance, not a copy
+ */
+ protected List getSpiderStatuses() {
+ return this.spiderStatuses;
+ }
+
public static SpiderMonitor instance() {
return INSTANCE;
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
index a87c040bd1faf3f5b493d241c72604655d24f9c7..69afe042a24ff073f6b8589f80aab79e51d20f32 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
@@ -84,8 +84,13 @@ public class SpiderStatus implements SpiderStatusMXBean {
@Override
public int getPagePerSecond() {
- int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000;
- return getSuccessPageCount() / runSeconds;
+ // Average number of successful pages per elapsed second since the start time.
+ // Returns -1 when no rate can be computed: no start time yet, or the elapsed
+ // time rounds to zero whole seconds.
+ if (getStartTime() != null) {
+ // Divide in long arithmetic and narrow afterwards. Casting the millisecond
+ // difference to int before dividing wraps once the run exceeds about 24.8 days.
+ long runSeconds = (System.currentTimeMillis() - getStartTime().getTime()) / 1000;
+ if (runSeconds != 0) {
+ return (int) (getSuccessPageCount() / runSeconds);
+ }
+ }
+ return -1;
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
index 6ca9828538baf4bc034db92c47c649459c5c6f47..fec3c1db9a84dd1cf9c6dacd96602a5702ec0613 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
@@ -1,12 +1,13 @@
package us.codecraft.webmagic.scheduler;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.math.NumberUtils;
-import us.codecraft.webmagic.Request;
-import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
-
-import java.io.*;
+import java.io.BufferedReader;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
@@ -17,6 +18,13 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.math.NumberUtils;
+
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
/**
* Store urls and cursor in files so that a Spider can resume the status when shutdown.
@@ -141,7 +149,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
urls.add(line.trim());
lineReaded++;
if (lineReaded > cursor.get()) {
- queue.add(new Request(line));
+ queue.add(deserializeRequest(line));
}
}
} finally {
@@ -183,7 +191,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
init(task);
}
queue.add(request);
- fileUrlWriter.println(request.getUrl());
+ fileUrlWriter.println(serializeRequest(request));
}
@Override
@@ -204,4 +212,13 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
public int getTotalRequestsCount(Task task) {
return getDuplicateRemover().getTotalRequestsCount(task);
}
+
+ /**
+ * Converts a request into the text that is written, via {@code println}, to the url file
+ * when the request is pushed. This implementation stores only the request's URL.
+ * The result is parsed back by {@code deserializeRequest}, so the two must agree.
+ *
+ * @param request the request being queued
+ * @return the text to store for the request
+ */
+ protected String serializeRequest(Request request) {
+ return request.getUrl();
+ }
+
+ /**
+ * Rebuilds a request from a line read back from the url file. This implementation
+ * treats the whole line as the URL, mirroring {@code serializeRequest}.
+ *
+ * @param line one line previously produced by {@code serializeRequest}
+ * @return the request to put back on the queue
+ */
+ protected Request deserializeRequest(String line) {
+ return new Request(line);
+ }
+
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
index 540574ad25f4038d7a1b08972733261c1e58ce82..46d47e5a5b29837368cd1e518473810ad2610bb0 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
@@ -1,22 +1,23 @@
package us.codecraft.webmagic.scheduler;
-import com.alibaba.fastjson.JSON;
+import java.util.Set;
+
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
+
+import com.alibaba.fastjson.JSON;
+
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
-import java.util.Set;
-
/**
* the redis scheduler with priority
* @author sai
* Created by sai on 16-5-27.
*/
-public class RedisPriorityScheduler extends RedisScheduler
-{
+public class RedisPriorityScheduler extends RedisScheduler {
private static final String ZSET_PREFIX = "zset_";
@@ -37,62 +38,44 @@ public class RedisPriorityScheduler extends RedisScheduler
}
@Override
- protected void pushWhenNoDuplicate(Request request, Task task)
- {
- Jedis jedis = pool.getResource();
- try
- {
- if(request.getPriority() > 0)
+ protected void pushWhenNoDuplicate(Request request, Task task) {
+ try (Jedis jedis = pool.getResource()) {
+ if (request.getPriority() > 0) {
jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
- else if(request.getPriority() < 0)
+ } else if (request.getPriority() < 0) {
jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
- else
+ } else {
jedis.lpush(getQueueNoPriorityKey(task), request.getUrl());
+ }
setExtrasInItem(jedis, request, task);
}
- finally
- {
- pool.returnResource(jedis);
- }
}
@Override
- public synchronized Request poll(Task task)
- {
- Jedis jedis = pool.getResource();
- try
- {
+ public synchronized Request poll(Task task) {
+ try (Jedis jedis = pool.getResource()) {
String url = getRequest(jedis, task);
- if(StringUtils.isBlank(url))
+ if (StringUtils.isBlank(url)) {
return null;
+ }
return getExtrasInItem(jedis, url, task);
}
- finally
- {
- pool.returnResource(jedis);
- }
}
- private String getRequest(Jedis jedis, Task task)
- {
+ private String getRequest(Jedis jedis, Task task) {
String url;
Set urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
- if(urls.isEmpty())
- {
+ if (urls.isEmpty()) {
url = jedis.lpop(getQueueNoPriorityKey(task));
- if(StringUtils.isBlank(url))
- {
+ if (StringUtils.isBlank(url)) {
urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
- if(!urls.isEmpty())
- {
+ if (!urls.isEmpty()) {
url = urls.toArray(new String[0])[0];
jedis.zrem(getZsetMinusPriorityKey(task), url);
}
}
- }
- else
- {
+ } else {
url = urls.toArray(new String[0])[0];
jedis.zrem(getZsetPlusPriorityKey(task), url);
}
@@ -100,51 +83,39 @@ public class RedisPriorityScheduler extends RedisScheduler
}
@Override
- public void resetDuplicateCheck(Task task)
- {
- Jedis jedis = pool.getResource();
- try
- {
+ public void resetDuplicateCheck(Task task) {
+ try (Jedis jedis = pool.getResource()) {
jedis.del(getSetKey(task));
}
- finally
- {
- pool.returnResource(jedis);
- }
}
- private String getZsetPlusPriorityKey(Task task)
- {
+ private String getZsetPlusPriorityKey(Task task) {
return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX;
}
- private String getQueueNoPriorityKey(Task task)
- {
+ private String getQueueNoPriorityKey(Task task) {
return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX;
}
- private String getZsetMinusPriorityKey(Task task)
- {
+ private String getZsetMinusPriorityKey(Task task) {
return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX;
}
- private void setExtrasInItem(Jedis jedis,Request request, Task task)
- {
- if(request.getExtras() != null)
- {
- String field = DigestUtils.shaHex(request.getUrl());
+ private void setExtrasInItem(Jedis jedis,Request request, Task task) {
+ if (request.getExtras() != null) {
+ String field = DigestUtils.sha1Hex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset(getItemKey(task), field, value);
}
}
- private Request getExtrasInItem(Jedis jedis, String url, Task task)
- {
+ /**
+ * Loads the Request stored for {@code url} in the task's item hash, where the field is
+ * the SHA-1 hex digest of the URL. Returns a plain Request for the URL when the hash
+ * has no entry for it.
+ */
+ private Request getExtrasInItem(Jedis jedis, String url, Task task) {
String key = getItemKey(task);
- String field = DigestUtils.shaHex(url);
+ String field = DigestUtils.sha1Hex(url);
- byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
- if(bytes != null)
- return JSON.parseObject(new String(bytes), Request.class);
+ // Use the String overload so Jedis does the byte conversion itself, matching the
+ // String-based hset that wrote the value. getBytes() / new String(bytes) without a
+ // charset use the platform default and can garble non-ASCII JSON.
+ String json = jedis.hget(key, field);
+ if (json != null) {
+ return JSON.parseObject(json, Request.class);
+ }
return new Request(url);
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
index c70d88507dc2e0849505496fdda578dc6fe2201f..19e831321ced7aace5be2b206a321845ec85528e 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
@@ -1,8 +1,10 @@
package us.codecraft.webmagic.scheduler;
-import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
+
+import com.alibaba.fastjson.JSON;
+
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
@@ -37,21 +39,15 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
@Override
public void resetDuplicateCheck(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
jedis.del(getSetKey(task));
- } finally {
- pool.returnResource(jedis);
}
}
@Override
public boolean isDuplicate(Request request, Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
- } finally {
- pool.returnResource(jedis);
}
}
@@ -62,7 +58,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
try {
jedis.rpush(getQueueKey(task), request.getUrl());
if (checkForAdditionalInfo(request)) {
- String field = DigestUtils.shaHex(request.getUrl());
+ String field = DigestUtils.sha1Hex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
}
@@ -100,14 +96,13 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
@Override
public synchronized Request poll(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
String url = jedis.lpop(getQueueKey(task));
if (url == null) {
return null;
}
String key = ITEM_PREFIX + task.getUUID();
- String field = DigestUtils.shaHex(url);
+ String field = DigestUtils.sha1Hex(url);
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
if (bytes != null) {
Request o = JSON.parseObject(new String(bytes), Request.class);
@@ -115,8 +110,6 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
}
Request request = new Request(url);
return request;
- } finally {
- pool.returnResource(jedis);
}
}
@@ -134,23 +127,17 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
@Override
public int getLeftRequestsCount(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
Long size = jedis.llen(getQueueKey(task));
return size.intValue();
- } finally {
- pool.returnResource(jedis);
}
}
@Override
public int getTotalRequestsCount(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
Long size = jedis.scard(getSetKey(task));
return size.intValue();
- } finally {
- pool.returnResource(jedis);
}
}
}
diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml
index 072bb3fd5d1002bd908683a293ef453c48410149..7f7ceb22845a6724f54fd6b1018c954a0f7b213c 100644
--- a/webmagic-samples/pom.xml
+++ b/webmagic-samples/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.7.3
+ 0.7.5
4.0.0
@@ -11,12 +11,12 @@
- us.codecraft
+ ${project.groupId}
webmagic-core
${project.version}
- us.codecraft
+ ${project.groupId}
webmagic-extension
${project.version}
@@ -24,6 +24,26 @@
junit
junit
+
+ org.mapdb
+ mapdb
+ 3.0.8
+
+
+ com.fasterxml.jackson.core
+ jackson-core
+ 2.13.0-rc1
+
+
+ com.fasterxml.jackson.core
+ jackson-annotations
+ 2.13.0-rc1
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ 2.13.2.1
+
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java
new file mode 100644
index 0000000000000000000000000000000000000000..bee80e775cd7716b18d41b9158be9fe8185ccac1
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java
@@ -0,0 +1,78 @@
+package us.codecraft.webmagic.recover;
+
+import com.google.common.base.Charsets;
+import com.google.common.hash.BloomFilter;
+import com.google.common.hash.Funnels;
+import org.mapdb.DB;
+import org.mapdb.DBMaker;
+import org.mapdb.IndexTreeList;
+import org.mapdb.Serializer;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * A {@link DuplicateRemover} that keeps the URLs it has seen in a MapDB file so the
+ * duplicate check survives a restart: on construction the stored URLs are loaded back
+ * into an in-memory bloom filter, which then answers the duplicate checks.
+ * <p>
+ * Because the check goes through a bloom filter, a URL that was never seen can
+ * occasionally be reported as a duplicate (see {@link #FALSE_POSITIVE_PROBABILITY}).
+ *
+ * @author :linweisen
+ */
+public class DuplicateStorageRemover implements DuplicateRemover {
+
+    /** Name of the list inside the MapDB file that stores the seen URLs. */
+    private static final String DATABASE_NAME = "duplicate";
+
+    /** Number of URLs the bloom filter is sized for. */
+    private static final int EXPECTED_INSERTIONS = 200000;
+
+    /** Target false-positive probability of the bloom filter. */
+    private static final double FALSE_POSITIVE_PROBABILITY = 1E-7;
+
+    private final DB db;
+
+    private final IndexTreeList<String> urlDuplicateQueue;
+
+    private BloomFilter<CharSequence> bloomFilter;
+
+    private final AtomicInteger counter;
+
+    /**
+     * Opens (or creates) the MapDB file at {@code path} and primes the bloom filter and
+     * the counter with the URLs already stored there.
+     *
+     * @param path location of the MapDB file holding the seen URLs
+     */
+    public DuplicateStorageRemover(String path) {
+        this.db = DBMaker.fileDB(path)
+                .fileMmapEnableIfSupported()
+                .fileMmapPreclearDisable()
+                .cleanerHackEnable()
+                .closeOnJvmShutdown()
+                .transactionEnable()
+                .concurrencyScale(128)
+                .make();
+
+        this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
+
+        this.counter = new AtomicInteger(this.urlDuplicateQueue.size());
+        this.bloomFilter = newBloomFilter();
+        for (String url : this.urlDuplicateQueue) {
+            bloomFilter.put(url);
+        }
+    }
+
+    /**
+     * Reports whether the request's URL has been seen before; a new URL is recorded in
+     * the bloom filter and in the stored list (committed immediately) and counted.
+     *
+     * @param request the request whose URL is checked
+     * @param task    unused by this implementation
+     * @return {@code true} if the bloom filter already contains the URL
+     */
+    @Override
+    public boolean isDuplicate(Request request, Task task) {
+        String url = request.getUrl();
+        boolean isDuplicate = bloomFilter.mightContain(url);
+        if (!isDuplicate) {
+            bloomFilter.put(url);
+            urlDuplicateQueue.add(url);
+            this.db.commit();
+            counter.incrementAndGet();
+        }
+        return isDuplicate;
+    }
+
+    /**
+     * Forgets every seen URL: replaces the bloom filter, empties the stored list, and
+     * resets the counter so {@link #getTotalRequestsCount(Task)} starts again from zero.
+     * The cleared list is committed, like every other write this class makes.
+     *
+     * @param task unused by this implementation
+     */
+    @Override
+    public void resetDuplicateCheck(Task task) {
+        this.bloomFilter = newBloomFilter();
+        this.urlDuplicateQueue.clear();
+        this.counter.set(0);
+        this.db.commit();
+    }
+
+    /**
+     * @param task unused by this implementation
+     * @return number of distinct URLs recorded since the last reset, including those
+     *         loaded from the file at construction
+     */
+    @Override
+    public int getTotalRequestsCount(Task task) {
+        return counter.get();
+    }
+
+    /** Creates an empty bloom filter with this class's sizing parameters. */
+    private static BloomFilter<CharSequence> newBloomFilter() {
+        return BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8),
+                EXPECTED_INSERTIONS, FALSE_POSITIVE_PROBABILITY);
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java
new file mode 100644
index 0000000000000000000000000000000000000000..4cee18afd703c7ae64437616bc165a8aba1f1d32
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java
@@ -0,0 +1,85 @@
+package us.codecraft.webmagic.recover;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.commons.lang3.StringUtils;
+import org.mapdb.DB;
+import org.mapdb.DBMaker;
+import org.mapdb.IndexTreeList;
+import org.mapdb.Serializer;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.io.IOException;
+
+/**
+ * A scheduler whose request queue lives in a MapDB file: each queued {@link Request} is
+ * stored as a JSON string in a list that is opened (or created) at the given path, so
+ * requests still in the file are available again when the scheduler is re-created.
+ *
+ * @author :linweisen
+ */
+public class MmapQueueScheduler extends DuplicateRemovedScheduler {
+
+    /** Name of the list inside the MapDB file that stores the queued requests. */
+    private static final String DATABASE_NAME = "queue";
+
+    /** One mapper shared by all instances, created once instead of per constructor call. */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private final DB db;
+
+    private final IndexTreeList<String> queue;
+
+    /**
+     * @param duplicateRemover the duplicate remover installed on this scheduler
+     * @param path             location of the MapDB file holding the queue
+     */
+    public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) {
+        super.setDuplicateRemover(duplicateRemover);
+
+        this.db = DBMaker.fileDB(path)
+                .fileMmapEnableIfSupported()
+                .fileMmapPreclearDisable()
+                .cleanerHackEnable()
+                .closeOnJvmShutdown()
+                .transactionEnable()
+                .concurrencyScale(128)
+                .make();
+        this.queue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
+    }
+
+    /**
+     * Removes and returns the request at the head of the queue.
+     * <p>
+     * Synchronized so the emptiness check and the removal cannot interleave between two
+     * polling threads (otherwise the second could remove from an already-empty list).
+     *
+     * @param task unused by this implementation
+     * @return the next request, or {@code null} if the queue is empty or the stored
+     *         entry cannot be parsed
+     */
+    @Override
+    public synchronized Request poll(Task task) {
+        if (queue.isEmpty()) {
+            return null;
+        }
+        String json = queue.remove(0);
+        return fromJson(json, Request.class);
+    }
+
+    /**
+     * Appends the request to the stored queue as JSON and commits the change.
+     *
+     * @param request the request to enqueue
+     * @param task    unused by this implementation
+     */
+    @Override
+    public void pushWhenNoDuplicate(Request request, Task task) {
+        String json = toJson(request);
+        if (json == null) {
+            // toJson already logged the failure; do not store a null entry in the queue.
+            return;
+        }
+        queue.add(json);
+        this.db.commit();
+    }
+
+    /**
+     * Serializes an object to a JSON string.
+     *
+     * @param object the object to serialize
+     * @return the JSON text, or {@code null} if serialization fails (the error is logged)
+     */
+    public String toJson(Object object) {
+        try {
+            return MAPPER.writeValueAsString(object);
+        } catch (IOException e) {
+            logger.warn("write to json string error:" + object, e);
+            return null;
+        }
+    }
+
+    /**
+     * Parses a JSON string into an instance of {@code clazz}.
+     *
+     * @param jsonString the JSON text; may be {@code null} or empty
+     * @param clazz      the type to produce
+     * @return the parsed object, or {@code null} if the input is empty or cannot be
+     *         parsed (the error is logged)
+     */
+    public <T> T fromJson(String jsonString, Class<T> clazz) {
+        if (StringUtils.isEmpty(jsonString)) {
+            return null;
+        }
+        try {
+            return MAPPER.readValue(jsonString, clazz);
+        } catch (IOException e) {
+            logger.warn("parse json string error:" + jsonString, e);
+            return null;
+        }
+    }
+
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java
new file mode 100644
index 0000000000000000000000000000000000000000..4fb91a0d25aa34a587f5a6c9ad51e9e4f88c1668
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java
@@ -0,0 +1,22 @@
+package us.codecraft.webmagic.recover;
+
+
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.samples.SinaBlogProcessor;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+/**
+ * Sample entry point that runs a spider with the file-backed scheduler and duplicate
+ * remover from this package.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class RecoverSample {
+
+    public static void main(String[] args) {
+        // Files used by the scheduler and by the duplicate remover.
+        final String queuePath = "queue";
+        final String duplicatePath = "duplicate";
+        final String startUrl = "http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html";
+
+        Spider crawler = new Spider(new SinaBlogProcessor());
+
+        DuplicateRemover seenUrls = new DuplicateStorageRemover(duplicatePath);
+        MmapQueueScheduler scheduler = new MmapQueueScheduler(seenUrls, queuePath);
+        crawler.setScheduler(scheduler);
+
+        crawler.addUrl(startUrl).run();
+    }
+}
diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml
index 95f706ed5beb5f92f03daec44920c3f58736c13b..119e50f15ea0c5193ebe89dcfa99a32daaf86945 100644
--- a/webmagic-saxon/pom.xml
+++ b/webmagic-saxon/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.7.3
+ 0.7.5
4.0.0
@@ -11,19 +11,17 @@
- us.codecraft
+ ${project.groupId}
webmagic-core
${project.version}
net.sourceforge.htmlcleaner
htmlcleaner
- 2.5
net.sf.saxon
Saxon-HE
- 9.5.1-1
junit
@@ -34,7 +32,9 @@
+ org.apache.maven.plugins
maven-deploy-plugin
+ 3.0.0-M1
true
diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
index 98b1efe4b3c1d022b4163f160c50126f6a311156..1f1f0a5723839de1cc2e25474cdfeb83531d3e4c 100644
--- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
+++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
@@ -1,15 +1,11 @@
package us.codecraft.webmagic.selector;
-import net.sf.saxon.lib.NamespaceConstant;
-import net.sf.saxon.xpath.XPathEvaluator;
-import org.apache.log4j.Logger;
-import org.htmlcleaner.CleanerProperties;
-import org.htmlcleaner.DomSerializer;
-import org.htmlcleaner.HtmlCleaner;
-import org.htmlcleaner.TagNode;
-import org.w3c.dom.Document;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
import javax.xml.namespace.NamespaceContext;
import javax.xml.transform.OutputKeys;
@@ -20,12 +16,19 @@ import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
+
+import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.DomSerializer;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import net.sf.saxon.lib.NamespaceConstant;
+import net.sf.saxon.xpath.XPathEvaluator;
/**
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
@@ -40,7 +43,7 @@ public class Xpath2Selector implements Selector {
private XPathExpression xPathExpression;
- private Logger logger = Logger.getLogger(getClass());
+ private Logger logger = LoggerFactory.getLogger(getClass());
public Xpath2Selector(String xpathStr) {
this.xpathStr = xpathStr;
diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
index aa3765a0c2cdf2abdd85dde790856c58d685b06c..32906b57a55895f7663dc8ad3ceaa91bdff92fd0 100644
--- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
+++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
@@ -1,5 +1,7 @@
package us.codecraft.webmagic.selector;
+import java.util.List;
+
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
@@ -8,6 +10,7 @@ import org.jsoup.nodes.Document;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
+
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
@@ -1367,15 +1370,19 @@ public class XpathSelectorTest {
public void testXPath2() {
String text = "眉山:扎实推进农业农村工作 促农持续增收
\n" +
"2013-07-31 23:29:45 来源:眉山网 责任编辑:张斯炜
";
- XpathSelector xpathSelector = new XpathSelector("//h1/text()");
- System.out.println(xpathSelector.select(text));
+ Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()");
+ Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text));
}
@Test
public void testXpath2Selector() {
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
String select = xpath2Selector.select(html);
- Assert.assertNotNull(select);
+ Assert.assertEquals("http://www.oschina.net/", select);
+
+ List selectList = xpath2Selector.selectList(html);
+ Assert.assertEquals(113, selectList.size());
+ Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
}
@Ignore("take long time")
diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
old mode 100755
new mode 100644
index 22956cb55ed22e9238d38871eabe8b71c1ac80ec..f0c16795552c186c3d16d5632cd6274e150870db
--- a/webmagic-scripts/pom.xml
+++ b/webmagic-scripts/pom.xml
@@ -3,41 +3,32 @@
webmagic-parent
us.codecraft
- 0.7.3
+ 0.7.5
4.0.0
- us.codecraft
webmagic-scripts
- 1.1.2-2
+ 1.6.0
org.jruby
jruby
- 1.7.6
org.jetbrains.kotlin
kotlin-stdlib
${kotlin.version}
-
- org.codehaus.groovy
- groovy-all
- 2.1.6
-
- org.python
+ org.python
jython
- 2.5.3
commons-cli
commons-cli
- 1.2
junit
@@ -45,12 +36,16 @@
test
- us.codecraft
+ ${project.groupId}
webmagic-core
${project.version}
- us.codecraft
+ org.slf4j
+ slf4j-log4j12
+
+
+ ${project.groupId}
webmagic-extension
${project.version}
@@ -59,21 +54,6 @@
${project.basedir}/src/main/java
-
- maven-compiler-plugin
-
- 1.6
- 1.6
- UTF-8
-
-
-
- org.apache.maven.plugins
- maven-resources-plugin
-
- UTF-8
-
-
org.apache.maven.plugins
maven-jar-plugin
diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml
index 1cbf59216a812072848e31e4dee7523a408901f9..42a6da905de3018f0863607a1cb2656ede884631 100644
--- a/webmagic-selenium/pom.xml
+++ b/webmagic-selenium/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.7.3
+ 0.7.5
4.0.0
@@ -13,21 +13,16 @@
org.seleniumhq.selenium
selenium-java
- 2.41.0
- us.codecraft
+ ${project.groupId}
webmagic-core
${project.version}
com.github.detro
phantomjsdriver
- 1.2.0
-
-
-
junit
junit
@@ -37,7 +32,9 @@
+ org.apache.maven.plugins
maven-deploy-plugin
+ 3.0.0-M1
true
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
index f45f7e2a8a52ce287138f793ea825fe08f3792fe..cce293fc9e18f96c470b588b26b9da0d9018dae1 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
@@ -1,10 +1,12 @@
package us.codecraft.webmagic.downloader.selenium;
-import org.apache.log4j.Logger;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
@@ -29,7 +31,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
private volatile WebDriverPool webDriverPool;
- private Logger logger = Logger.getLogger(getClass());
+ private Logger logger = LoggerFactory.getLogger(getClass());
private int sleepTime = 0;
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
index 1472cb32c60a00cfc11f531283b3cf919c415436..e1d9dd03904fade0ae91ac981d21bfdd7beac025 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
@@ -1,6 +1,5 @@
package us.codecraft.webmagic.downloader.selenium;
-import org.apache.log4j.Logger;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
@@ -8,6 +7,8 @@ import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.io.IOException;
@@ -27,7 +28,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* Time: 下午1:41
*/
class WebDriverPool {
- private Logger logger = Logger.getLogger(getClass());
+ private Logger logger = LoggerFactory.getLogger(getClass());
private final static int DEFAULT_CAPACITY = 5;