From c5a037a8072575b0938bfc26b0e326931f7a6b16 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 13:02:46 +0800 Subject: [PATCH 1/5] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 51e6fdb..cda7ad1 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index ec718a1..049477c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 16ed1b4..e6e6068 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 85d5c63..741b081 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index dda1821..c5582c0 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 119e50f..d4d3efa 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 1aca5b3..fe4ef68 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 42a6da9..be36376 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 -- Gitee From ab5d81a6b6ab215e3450cb2fde94df12c5e49544 Mon Sep 17 00:00:00 2001 From: "carl.don:tjr" Date: Wed, 4 Aug 2021 17:17:22 +0800 Subject: [PATCH 2/5] perfect Spider.run to avoid some rare concurrent issue, change the Spider.emptySleepTime to long type --- .../java/us/codecraft/webmagic/Spider.java | 89 +++++++++++++------ 1 file changed, 60 insertions(+), 29 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 5940e73..65c0cee 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -106,7 +106,7 @@ public class Spider implements Runnable, Task { private Date startTime; - private int emptySleepTime = 30000; + private long emptySleepTime = 30000; /** * create a spider with pageProcessor. @@ -305,32 +305,52 @@ public class Spider implements Runnable, Task { public void run() { checkRunningStat(); initComponent(); - logger.info("Spider {} started!",getUUID()); + logger.info("Spider {} started!", getUUID()); + // interrupt won't be necessarily detected while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { - final Request request = scheduler.poll(this); - if (request == null) { - if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { - break; - } - // wait until new url added - waitNewUrl(); - } else { - threadPool.execute(new Runnable() { - @Override - public void run() { - try { - processRequest(request); - onSuccess(request); - } catch (Exception e) { - onError(request, e); - logger.error("process request " + request + " error", e); - } finally { - pageCount.incrementAndGet(); - signalNewUrl(); + Request poll = scheduler.poll(this); + if (poll == null) { + if (threadPool.getThreadAlive() == 0) { + //no alive thread anymore , try again + poll = scheduler.poll(this); + if(poll==null) { + if (exitWhenComplete) { + break; + }else{ + // wait + try { + Thread.sleep(emptySleepTime); + continue; + } catch (InterruptedException e) { + break; + } } } - }); + }else { + // wait until new url added, + if(waitNewUrl()) + //if interrupted + break; + continue; + } } + final Request request = poll; + //this may swallow the interruption + threadPool.execute(new Runnable() { + @Override + public void run() { + try { + processRequest(request); + onSuccess(request); + } catch (Exception e) { + onError(request,e); + logger.error("process request " + request + " error", e); + } finally { + pageCount.incrementAndGet(); + signalNewUrl(); + } + } + }); } stat.set(STAT_STOPPED); // release some resources @@ -565,16 +585,24 @@ public class Spider implements Runnable, Task { return this; } - private void waitNewUrl() { + /** + * + * @return isInterrupted + */ + private boolean waitNewUrl() { + // now there may not be any thread live newUrlLock.lock(); try { - //double check - if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { - return; + //double check,unnecessary, unless very fast concurrent + if (threadPool.getThreadAlive() == 0) { + return false; } + //wait for amount of time newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); + return false; } catch (InterruptedException e) { - logger.warn("waitNewUrl - interrupted, error {}", e); + // logger.warn("waitNewUrl - interrupted, error {}", e); + return true; } finally { newUrlLock.unlock(); } @@ -772,7 +800,10 @@ public class Spider implements Runnable, Task { * * @param emptySleepTime In MILLISECONDS. */ - public void setEmptySleepTime(int emptySleepTime) { + public void setEmptySleepTime(long emptySleepTime) { + if(emptySleepTime<=0){ + throw new IllegalArgumentException("emptySleepTime should be more than zero!"); + } this.emptySleepTime = emptySleepTime; } } -- Gitee From fcdb9074d69543b81fd350075d182ce1eeaf26ac Mon Sep 17 00:00:00 2001 From: "carl.don:tjr" Date: Wed, 4 Aug 2021 18:23:04 +0800 Subject: [PATCH 3/5] =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E5=8C=96=20Spider.run=20?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/us/codecraft/webmagic/Spider.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 65c0cee..bc8bb94 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -313,10 +313,10 @@ public class Spider implements Runnable, Task { if (threadPool.getThreadAlive() == 0) { //no alive thread anymore , try again poll = scheduler.poll(this); - if(poll==null) { + if (poll == null) { if (exitWhenComplete) { break; - }else{ + } else { // wait try { Thread.sleep(emptySleepTime); @@ -326,9 +326,9 @@ public class Spider implements Runnable, Task { } } } - }else { + } else { // wait until new url added, - if(waitNewUrl()) + if (waitNewUrl()) //if interrupted break; continue; @@ -343,7 +343,7 @@ public class Spider implements Runnable, Task { processRequest(request); onSuccess(request); } catch (Exception e) { - onError(request,e); + onError(request, e); logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); -- Gitee From 34da2fb3a02708b562ec747679ef0cd8d171a042 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 24 Oct 2021 23:20:38 +0800 Subject: [PATCH 4/5] Make PageProcessor#getSite be default method. Closes #1040. --- .../webmagic/processor/PageProcessor.java | 26 +++++++----- .../webmagic/processor/PageProcessorTest.java | 40 +++++++++++++++++++ 2 files changed, 56 insertions(+), 10 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index 1fb125c..3d79b96 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,13 +4,16 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; /** - * Interface to be implemented to customize a crawler.
- *
+ * Interface to be implemented to customize a crawler. + * + *

* In PageProcessor, you can customize: - *
- * start urls and other settings in {@link Site}
- * how the urls to fetch are detected
- * how the data are extracted and stored
+ *

+ *
    + *
  • start URLs and other settings in {@link Site}
  • + *
  • how the URLs to fetch are detected
  • + *
  • how the data are extracted and stored
  • + *
* * @author code4crafter@gmail.com
* @see Site @@ -20,17 +23,20 @@ import us.codecraft.webmagic.Site; public interface PageProcessor { /** - * process the page, extract urls to fetch, extract the data and store + * Processes the page, extract URLs to fetch, extract the data and store. * * @param page page */ - public void process(Page page); + void process(Page page); /** - * get the site settings + * Returns the site settings. * * @return site * @see Site */ - public Site getSite(); + default Site getSite() { + return Site.me(); + } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java new file mode 100644 index 0000000..ebb1225 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic.processor; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; + +public class PageProcessorTest { + + @Test + public void testGetSite() { + Site actualSite = new PageProcessor() { + + @Override + public void process(Page page) { + } + + }.getSite(); + + assertEquals(Site.me(), actualSite); + + actualSite = new PageProcessor() { + + @Override + public void process(Page page) { + } + + @Override + public Site getSite() { + return Site.me().setTimeOut(123); + }; + + }.getSite(); + + assertEquals(Site.me().setTimeOut(123), actualSite); + } + +} -- Gitee From 54da7af17eaffeb54360b4ed81639d84bd064281 Mon Sep 17 00:00:00 2001 From: David Hsing Date: Tue, 3 May 2022 17:42:42 +0800 Subject: [PATCH 5/5] change dependency versions into properties change dependency versions into properties update commons-collections from 3.x to 4.4 --- pom.xml | 78 ++++++++++++------- webmagic-core/pom.xml | 4 +- .../java/us/codecraft/webmagic/Spider.java | 26 ++++--- .../webmagic/selector/AbstractSelectable.java | 2 +- .../webmagic/selector/CssSelector.java | 8 +- .../webmagic/selector/JsonPathSelector.java | 20 +++-- .../webmagic/selector/XpathSelector.java | 6 +- .../downloader/HttpClientDownloaderTest.java | 31 +++++--- .../downloader/MockGithubDownloader.java | 8 +- .../codecraft/webmagic/model/PageMocker.java | 8 +- .../webmagic/samples/AngularJSProcessor.java | 6 +- .../samples/InfoQMiniBookProcessor.java | 2 +- .../webmagic/scripts/ScriptProcessor.java | 19 ++--- .../scripts/ScriptProcessorBuilder.java | 8 +- 14 files changed, 139 insertions(+), 87 deletions(-) diff --git a/pom.xml b/pom.xml index cda7ad1..3774b4b 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,31 @@ UTF-8 1.8 1.8 + 3.18.1 + 1.4 + 4.4 + 2.11.0 + 3.12.0 + 1.2.75 + 3.0.10 + 31.1-jre + 2.26 + 4.5.13 + 4.4.14 + 3.7.1 + 9.2.14.0 + 2.6.0 + 4.13.2 + 2.7.2 + 1.2.17 + 1.10.19 + 1.1.0 + 1.2.0 + 10.3 + 3.141.59 + 1.7.36 4.0.0.RELEASE + 0.3.2
webmagic-parent webmagic-parent @@ -58,59 +82,59 @@ junit junit - 4.13.1 + ${junit.version} test org.mockito mockito-all - 1.10.19 + ${mockito-all.version} test org.apache.httpcomponents httpclient - 4.5.13 + ${httpclient.version} org.apache.httpcomponents httpcore - 4.4.14 + ${httpcore.version} com.google.guava guava - 30.1-jre + ${guava.version} com.jayway.jsonpath json-path - 2.5.0 + ${json-path.version} org.slf4j slf4j-api - 1.7.30 + ${slf4j.version} org.slf4j slf4j-log4j12 - 1.7.30 + ${slf4j.version} us.codecraft xsoup - 0.3.2 + ${xsoup.version} com.alibaba fastjson - 1.2.75 + ${fastjson.version} com.github.dreamhead moco-core - 1.1.0 + ${moco.version} test @@ -122,73 +146,73 @@ log4j log4j - 1.2.17 + ${log4j.version} org.assertj assertj-core - 3.18.1 + ${assertj.version} test org.apache.commons commons-lang3 - 3.11 + ${commons-lang3.version} - commons-collections - commons-collections - 3.2.2 + org.apache.commons + commons-collections4 + ${commons-collections4.version} commons-io commons-io - 2.8.0 + ${commons-io.version} org.codehaus.groovy groovy-all - 3.0.7 + ${groovy-all.version} org.jruby jruby - 9.2.14.0 + ${jruby.version} org.python jython - 2.7.2 + ${jython.version} org.seleniumhq.selenium selenium-java - 3.141.59 + ${selenium-java.version} net.sf.saxon Saxon-HE - 10.3 + ${saxon-he.version} net.sourceforge.htmlcleaner htmlcleaner - 2.9 + ${htmlcleaner.version} com.github.detro phantomjsdriver - 1.2.0 + ${phantomjsdriver.version} commons-cli commons-cli - 1.4 + ${commons-cli.version} redis.clients jedis - 3.6.0 + ${jedis.version} diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 049477c..64b8013 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -52,8 +52,8 @@ - commons-collections - commons-collections + org.apache.commons + commons-collections4 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index bc8bb94..00091c9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,6 +1,20 @@ package us.codecraft.webmagic; -import org.apache.commons.collections.CollectionUtils; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,16 +31,6 @@ import us.codecraft.webmagic.thread.CountableThreadPool; import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.WMCollections; -import java.io.Closeable; -import java.io.IOException; -import java.util.*; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; - /** * Entrance of a crawler.
* A spider contains four modules: Downloader, Scheduler, PageProcessor and diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index e2bb552..8775af1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -1,9 +1,9 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; import java.util.ArrayList; import java.util.List; +import org.apache.commons.collections4.CollectionUtils; /** * @author code4crafer@gmail.com diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 6a638db..cfe5547 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -1,14 +1,14 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; + +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; -import java.util.ArrayList; -import java.util.List; - /** * CSS selector. Based on Jsoup. * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index f5c0bae..aa9a903 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -1,11 +1,11 @@ package us.codecraft.webmagic.selector; -import com.alibaba.fastjson.JSON; -import com.jayway.jsonpath.JsonPath; import java.util.ArrayList; import java.util.List; import java.util.Map; +import com.alibaba.fastjson.JSON; +import com.jayway.jsonpath.JsonPath; /** * JsonPath selector.
@@ -16,15 +16,20 @@ import java.util.Map; */ public class JsonPathSelector implements Selector { - private String jsonPathStr; + private final String jsonPathStr; - private JsonPath jsonPath; + private final JsonPath jsonPath; public JsonPathSelector(String jsonPathStr) { this.jsonPathStr = jsonPathStr; this.jsonPath = JsonPath.compile(this.jsonPathStr); } + @SuppressWarnings("unused") + public String getJsonPathStr() { + return jsonPathStr; + } + @Override public String select(String text) { Object object = jsonPath.read(text); @@ -32,8 +37,8 @@ public class JsonPathSelector implements Selector { return null; } if (object instanceof List) { - List list = (List) object; - if (list != null && list.size() > 0) { + List list = (List) object; + if (list.size() > 0) { return toString(list.iterator().next()); } } @@ -49,8 +54,9 @@ public class JsonPathSelector implements Selector { } @Override + @SuppressWarnings("unchecked") public List selectList(String text) { - List list = new ArrayList(); + List list = new ArrayList<>(); Object object = jsonPath.read(text); if (object == null) { return list; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 8a980a5..4fa1469 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; + +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; -import java.util.List; - /** * XPath selector based on Xsoup.
* diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index ece0600..780ca75 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -1,9 +1,10 @@ package us.codecraft.webmagic.downloader; -import com.github.dreamhead.moco.HttpServer; -import com.github.dreamhead.moco.Runnable; -import com.github.dreamhead.moco.Runner; -import org.apache.commons.collections.map.HashedMap; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Map; +import org.apache.commons.collections4.map.HashedMap; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; @@ -11,6 +12,9 @@ import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.junit.Test; +import com.github.dreamhead.moco.HttpServer; +import com.github.dreamhead.moco.Runnable; +import com.github.dreamhead.moco.Runner; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -21,12 +25,19 @@ import us.codecraft.webmagic.proxy.SimpleProxyProvider; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.util.Map; - -import static com.github.dreamhead.moco.Moco.*; +import static com.github.dreamhead.moco.Moco.and; +import static com.github.dreamhead.moco.Moco.by; +import static com.github.dreamhead.moco.Moco.cookie; +import static com.github.dreamhead.moco.Moco.eq; +import static com.github.dreamhead.moco.Moco.form; +import static com.github.dreamhead.moco.Moco.header; +import static com.github.dreamhead.moco.Moco.httpServer; +import static com.github.dreamhead.moco.Moco.method; +import static com.github.dreamhead.moco.Moco.not; +import static com.github.dreamhead.moco.Moco.query; +import static com.github.dreamhead.moco.Moco.text; +import static com.github.dreamhead.moco.Moco.uri; +import static com.github.dreamhead.moco.Moco.with; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 3aa742c..58dd3a6 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -1,13 +1,15 @@ package us.codecraft.webmagic.downloader; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; -import java.io.InputStream; /** * @author code4crafter@gmail.com @@ -19,7 +21,7 @@ public class MockGithubDownloader implements Downloader { Page page = new Page(); InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html"); try { - page.setRawText(IOUtils.toString(resourceAsStream)); + page.setRawText(IOUtils.toString(resourceAsStream, Charset.defaultCharset())); } catch (IOException e) { e.printStackTrace(); } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java index 4b0c133..0451edc 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java @@ -1,11 +1,13 @@ package us.codecraft.webmagic.model; + +import java.io.IOException; +import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; /** * @author code4crafter@gmail.com @@ -16,7 +18,7 @@ public class PageMocker { public Page getMockJsonPage() throws IOException { Page page = new Page(); - page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"))); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"), Charset.defaultCharset())); page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic")); page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic")); return page; @@ -24,7 +26,7 @@ public class PageMocker { public Page getMockPage() throws IOException { Page page = new Page(); - page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"))); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"), Charset.defaultCharset())); page.setRequest(new Request("http://webmagic.io/list/0")); page.setUrl(new PlainText("http://webmagic.io/list/0")); return page; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java index ab560e4..46476bb 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java @@ -1,14 +1,14 @@ package us.codecraft.webmagic.samples; -import org.apache.commons.collections.CollectionUtils; + +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.JsonPathSelector; -import java.util.List; - /** * @author code4crafter@gmail.com * @since 0.5.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index 280f8f1..33dd6aa 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic.samples; -import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java index 1822318..78c9d87 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -1,5 +1,14 @@ package us.codecraft.webmagic.scripts; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.Iterator; +import java.util.Map; +import javax.script.ScriptContext; +import javax.script.ScriptEngine; +import javax.script.ScriptException; import org.apache.commons.io.IOUtils; import org.jruby.RubyHash; import org.python.core.PyDictionary; @@ -7,14 +16,6 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; -import javax.script.ScriptContext; -import javax.script.ScriptEngine; -import javax.script.ScriptException; -import java.io.IOException; -import java.io.InputStream; -import java.util.Iterator; -import java.util.Map; - /** * @author code4crafter@gmail.com * @since 0.4.1 @@ -39,7 +40,7 @@ public class ScriptProcessor implements PageProcessor { enginePool = new ScriptEnginePool(language, threadNum); InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(language.getDefineFile()); try { - defines = IOUtils.toString(resourceAsStream); + defines = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { throw new IllegalArgumentException(e); } diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java index 76b3e86..4691528 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.scripts; -import org.apache.commons.io.IOUtils; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; +import org.apache.commons.io.IOUtils; + /** * @author code4crafter@gmail.com @@ -35,7 +37,7 @@ public class ScriptProcessorBuilder { public ScriptProcessorBuilder scriptFromFile(String fileName) { try { InputStream resourceAsStream = new FileInputStream(fileName); - this.script = IOUtils.toString(resourceAsStream); + this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { //wrap IOException because I prefer a runtime exception... throw new IllegalArgumentException(e); @@ -46,7 +48,7 @@ public class ScriptProcessorBuilder { public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) { try { InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName); - this.script = IOUtils.toString(resourceAsStream); + this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { //wrap IOException because I prefer a runtime exception... throw new IllegalArgumentException(e); -- Gitee