diff --git a/pom.xml b/pom.xml index 51e6fdb85fd17ff80a056502b1a1df551ff99ac6..3774b4b25f80338b4b2fd82afd2da55b198eb9c2 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 pom @@ -9,7 +9,31 @@ UTF-8 1.8 1.8 + 3.18.1 + 1.4 + 4.4 + 2.11.0 + 3.12.0 + 1.2.75 + 3.0.10 + 31.1-jre + 2.26 + 4.5.13 + 4.4.14 + 3.7.1 + 9.2.14.0 + 2.6.0 + 4.13.2 + 2.7.2 + 1.2.17 + 1.10.19 + 1.1.0 + 1.2.0 + 10.3 + 3.141.59 + 1.7.36 4.0.0.RELEASE + 0.3.2 webmagic-parent webmagic-parent @@ -58,59 +82,59 @@ junit junit - 4.13.1 + ${junit.version} test org.mockito mockito-all - 1.10.19 + ${mockito-all.version} test org.apache.httpcomponents httpclient - 4.5.13 + ${httpclient.version} org.apache.httpcomponents httpcore - 4.4.14 + ${httpcore.version} com.google.guava guava - 30.1-jre + ${guava.version} com.jayway.jsonpath json-path - 2.5.0 + ${json-path.version} org.slf4j slf4j-api - 1.7.30 + ${slf4j.version} org.slf4j slf4j-log4j12 - 1.7.30 + ${slf4j.version} us.codecraft xsoup - 0.3.2 + ${xsoup.version} com.alibaba fastjson - 1.2.75 + ${fastjson.version} com.github.dreamhead moco-core - 1.1.0 + ${moco.version} test @@ -122,73 +146,73 @@ log4j log4j - 1.2.17 + ${log4j.version} org.assertj assertj-core - 3.18.1 + ${assertj.version} test org.apache.commons commons-lang3 - 3.11 + ${commons-lang3.version} - commons-collections - commons-collections - 3.2.2 + org.apache.commons + commons-collections4 + ${commons-collections4.version} commons-io commons-io - 2.8.0 + ${commons-io.version} org.codehaus.groovy groovy-all - 3.0.7 + ${groovy-all.version} org.jruby jruby - 9.2.14.0 + ${jruby.version} org.python jython - 2.7.2 + ${jython.version} org.seleniumhq.selenium selenium-java - 3.141.59 + ${selenium-java.version} net.sf.saxon Saxon-HE - 10.3 + ${saxon-he.version} net.sourceforge.htmlcleaner htmlcleaner - 2.9 + ${htmlcleaner.version} com.github.detro phantomjsdriver - 1.2.0 + ${phantomjsdriver.version} commons-cli commons-cli - 1.4 + ${commons-cli.version} redis.clients jedis - 3.6.0 + ${jedis.version} diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index ec718a1e3e80a5cebe583625d4a13e6b6b3c77fe..64b8013f244bff67d9d6fca674d2b73e43814c1e 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 @@ -52,8 +52,8 @@ - commons-collections - commons-collections + org.apache.commons + commons-collections4 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 5940e738db9bfe0dae27954306beacddae967f6e..00091c90a35b42f787e0e762616d8626867bcdae 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,6 +1,20 @@ package us.codecraft.webmagic; -import org.apache.commons.collections.CollectionUtils; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,16 +31,6 @@ import us.codecraft.webmagic.thread.CountableThreadPool; import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.WMCollections; -import java.io.Closeable; -import java.io.IOException; -import java.util.*; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; - /** * Entrance of a crawler.
* A spider contains four modules: Downloader, Scheduler, PageProcessor and @@ -106,7 +110,7 @@ public class Spider implements Runnable, Task { private Date startTime; - private int emptySleepTime = 30000; + private long emptySleepTime = 30000; /** * create a spider with pageProcessor. @@ -305,32 +309,52 @@ public class Spider implements Runnable, Task { public void run() { checkRunningStat(); initComponent(); - logger.info("Spider {} started!",getUUID()); + logger.info("Spider {} started!", getUUID()); + // interrupt won't be necessarily detected while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { - final Request request = scheduler.poll(this); - if (request == null) { - if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { - break; - } - // wait until new url added - waitNewUrl(); - } else { - threadPool.execute(new Runnable() { - @Override - public void run() { - try { - processRequest(request); - onSuccess(request); - } catch (Exception e) { - onError(request, e); - logger.error("process request " + request + " error", e); - } finally { - pageCount.incrementAndGet(); - signalNewUrl(); + Request poll = scheduler.poll(this); + if (poll == null) { + if (threadPool.getThreadAlive() == 0) { + //no alive thread anymore , try again + poll = scheduler.poll(this); + if (poll == null) { + if (exitWhenComplete) { + break; + } else { + // wait + try { + Thread.sleep(emptySleepTime); + continue; + } catch (InterruptedException e) { + break; + } } } - }); + } else { + // wait until new url added, + if (waitNewUrl()) + //if interrupted + break; + continue; + } } + final Request request = poll; + //this may swallow the interruption + threadPool.execute(new Runnable() { + @Override + public void run() { + try { + processRequest(request); + onSuccess(request); + } catch (Exception e) { + onError(request, e); + logger.error("process request " + request + " error", e); + } finally { + pageCount.incrementAndGet(); + signalNewUrl(); + } + } + }); } stat.set(STAT_STOPPED); // release some resources @@ -565,16 +589,24 @@ public class Spider implements Runnable, Task { return this; } - private void waitNewUrl() { + /** + * + * @return isInterrupted + */ + private boolean waitNewUrl() { + // now there may not be any thread live newUrlLock.lock(); try { - //double check - if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { - return; + //double check,unnecessary, unless very fast concurrent + if (threadPool.getThreadAlive() == 0) { + return false; } + //wait for amount of time newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); + return false; } catch (InterruptedException e) { - logger.warn("waitNewUrl - interrupted, error {}", e); + // logger.warn("waitNewUrl - interrupted, error {}", e); + return true; } finally { newUrlLock.unlock(); } @@ -772,7 +804,10 @@ public class Spider implements Runnable, Task { * * @param emptySleepTime In MILLISECONDS. */ - public void setEmptySleepTime(int emptySleepTime) { + public void setEmptySleepTime(long emptySleepTime) { + if(emptySleepTime<=0){ + throw new IllegalArgumentException("emptySleepTime should be more than zero!"); + } this.emptySleepTime = emptySleepTime; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index 1fb125c726f3258ad170b87701d6d50b9bf36b6e..3d79b96a9f74c25fbdfb1c099bab89094d4e1f5a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,13 +4,16 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; /** - * Interface to be implemented to customize a crawler.
- *
+ * Interface to be implemented to customize a crawler. + * + *

* In PageProcessor, you can customize: - *
- * start urls and other settings in {@link Site}
- * how the urls to fetch are detected
- * how the data are extracted and stored
+ *

+ * * * @author code4crafter@gmail.com
* @see Site @@ -20,17 +23,20 @@ import us.codecraft.webmagic.Site; public interface PageProcessor { /** - * process the page, extract urls to fetch, extract the data and store + * Processes the page, extract URLs to fetch, extract the data and store. * * @param page page */ - public void process(Page page); + void process(Page page); /** - * get the site settings + * Returns the site settings. * * @return site * @see Site */ - public Site getSite(); + default Site getSite() { + return Site.me(); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index e2bb552151edba8bcc33430d8cdb4e35015bd92c..8775af1088892703aaaca00b70f498ee034a6518 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -1,9 +1,9 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; import java.util.ArrayList; import java.util.List; +import org.apache.commons.collections4.CollectionUtils; /** * @author code4crafer@gmail.com diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 6a638dbff5cbd6f0b80063e9049d85d2ef42b6be..cfe55472aeb3be5a65e3ba7897a36513d4d8a5df 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -1,14 +1,14 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; + +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; -import java.util.ArrayList; -import java.util.List; - /** * CSS selector. Based on Jsoup. * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index f5c0baeb591a132d8272dc795e537f77f8c03c89..aa9a903f7fae102084663d9d179ad8949992f5b6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -1,11 +1,11 @@ package us.codecraft.webmagic.selector; -import com.alibaba.fastjson.JSON; -import com.jayway.jsonpath.JsonPath; import java.util.ArrayList; import java.util.List; import java.util.Map; +import com.alibaba.fastjson.JSON; +import com.jayway.jsonpath.JsonPath; /** * JsonPath selector.
@@ -16,15 +16,20 @@ import java.util.Map; */ public class JsonPathSelector implements Selector { - private String jsonPathStr; + private final String jsonPathStr; - private JsonPath jsonPath; + private final JsonPath jsonPath; public JsonPathSelector(String jsonPathStr) { this.jsonPathStr = jsonPathStr; this.jsonPath = JsonPath.compile(this.jsonPathStr); } + @SuppressWarnings("unused") + public String getJsonPathStr() { + return jsonPathStr; + } + @Override public String select(String text) { Object object = jsonPath.read(text); @@ -32,8 +37,8 @@ public class JsonPathSelector implements Selector { return null; } if (object instanceof List) { - List list = (List) object; - if (list != null && list.size() > 0) { + List list = (List) object; + if (list.size() > 0) { return toString(list.iterator().next()); } } @@ -49,8 +54,9 @@ public class JsonPathSelector implements Selector { } @Override + @SuppressWarnings("unchecked") public List selectList(String text) { - List list = new ArrayList(); + List list = new ArrayList<>(); Object object = jsonPath.read(text); if (object == null) { return list; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 8a980a50d00b3f1187d2815f5616d6e5bd730e70..4fa14699edfe940132acd12a451b10b9bb091e04 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; + +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; -import java.util.List; - /** * XPath selector based on Xsoup.
* diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index ece060003316e0b79980a30a5adaf75b7ca86d25..780ca7529aa03133922593cbae1397ba8858f171 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -1,9 +1,10 @@ package us.codecraft.webmagic.downloader; -import com.github.dreamhead.moco.HttpServer; -import com.github.dreamhead.moco.Runnable; -import com.github.dreamhead.moco.Runner; -import org.apache.commons.collections.map.HashedMap; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Map; +import org.apache.commons.collections4.map.HashedMap; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; @@ -11,6 +12,9 @@ import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.junit.Test; +import com.github.dreamhead.moco.HttpServer; +import com.github.dreamhead.moco.Runnable; +import com.github.dreamhead.moco.Runner; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -21,12 +25,19 @@ import us.codecraft.webmagic.proxy.SimpleProxyProvider; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.util.Map; - -import static com.github.dreamhead.moco.Moco.*; +import static com.github.dreamhead.moco.Moco.and; +import static com.github.dreamhead.moco.Moco.by; +import static com.github.dreamhead.moco.Moco.cookie; +import static com.github.dreamhead.moco.Moco.eq; +import static com.github.dreamhead.moco.Moco.form; +import static com.github.dreamhead.moco.Moco.header; +import static com.github.dreamhead.moco.Moco.httpServer; +import static com.github.dreamhead.moco.Moco.method; +import static com.github.dreamhead.moco.Moco.not; +import static com.github.dreamhead.moco.Moco.query; +import static com.github.dreamhead.moco.Moco.text; +import static com.github.dreamhead.moco.Moco.uri; +import static com.github.dreamhead.moco.Moco.with; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 3aa742c107a408b173db62e6a6575fb2f3f2a596..58dd3a6fade1c13b68d8885e874bae3dbd83d831 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -1,13 +1,15 @@ package us.codecraft.webmagic.downloader; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; -import java.io.InputStream; /** * @author code4crafter@gmail.com @@ -19,7 +21,7 @@ public class MockGithubDownloader implements Downloader { Page page = new Page(); InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html"); try { - page.setRawText(IOUtils.toString(resourceAsStream)); + page.setRawText(IOUtils.toString(resourceAsStream, Charset.defaultCharset())); } catch (IOException e) { e.printStackTrace(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java new file mode 100644 index 0000000000000000000000000000000000000000..ebb1225ccaa4db3b058eb2b8d722e935045372e1 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic.processor; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; + +public class PageProcessorTest { + + @Test + public void testGetSite() { + Site actualSite = new PageProcessor() { + + @Override + public void process(Page page) { + } + + }.getSite(); + + assertEquals(Site.me(), actualSite); + + actualSite = new PageProcessor() { + + @Override + public void process(Page page) { + } + + @Override + public Site getSite() { + return Site.me().setTimeOut(123); + }; + + }.getSite(); + + assertEquals(Site.me().setTimeOut(123), actualSite); + } + +} diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 16ed1b456412621f457b8171193acb5bfd883685..e6e606825b240b77e8e5354df728269601ab7711 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 85d5c639408304f7c1d51acd0c339d928ec89101..741b081d82c7a694eb576748718b135d37a01575 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java index 4b0c133cbd06c7af1144cac5080b620286f4aacb..0451edcfe43ffb39aab9eeb7c46de4c235992a0b 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java @@ -1,11 +1,13 @@ package us.codecraft.webmagic.model; + +import java.io.IOException; +import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; /** * @author code4crafter@gmail.com @@ -16,7 +18,7 @@ public class PageMocker { public Page getMockJsonPage() throws IOException { Page page = new Page(); - page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"))); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"), Charset.defaultCharset())); page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic")); page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic")); return page; @@ -24,7 +26,7 @@ public class PageMocker { public Page getMockPage() throws IOException { Page page = new Page(); - page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"))); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"), Charset.defaultCharset())); page.setRequest(new Request("http://webmagic.io/list/0")); page.setUrl(new PlainText("http://webmagic.io/list/0")); return page; diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index dda18216051f8bce0a0812fcfd3a5c0b74f1b483..c5582c0b3a6204716d2f73305e6efb8e106f411b 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java index ab560e451e3a9303d4f755d8506c6015fcd65492..46476bbc81ded615525f83c957ceec0447bde4a9 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java @@ -1,14 +1,14 @@ package us.codecraft.webmagic.samples; -import org.apache.commons.collections.CollectionUtils; + +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.JsonPathSelector; -import java.util.List; - /** * @author code4crafter@gmail.com * @since 0.5.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index 280f8f1861e3bf5faaeacddbea473ca7be11cf4d..33dd6aa3528b5d16bcb9d4c60a6f9deef3813cf1 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic.samples; -import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 119e50f15ea0c5193ebe89dcfa99a32daaf86945..d4d3efa18181bc6cf499f5fa837aee44c2094c22 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 1aca5b3aff7c6487ea204c60ac2f6c56d68286b3..fe4ef6840179bef176b43df0f6f616a4466af861 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java index 1822318c659bf7dc8a4dee034b618eb1f5c95157..78c9d87c8f6e11f2218ebe290fb50fe8ad7b36e9 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -1,5 +1,14 @@ package us.codecraft.webmagic.scripts; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.Iterator; +import java.util.Map; +import javax.script.ScriptContext; +import javax.script.ScriptEngine; +import javax.script.ScriptException; import org.apache.commons.io.IOUtils; import org.jruby.RubyHash; import org.python.core.PyDictionary; @@ -7,14 +16,6 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; -import javax.script.ScriptContext; -import javax.script.ScriptEngine; -import javax.script.ScriptException; -import java.io.IOException; -import java.io.InputStream; -import java.util.Iterator; -import java.util.Map; - /** * @author code4crafter@gmail.com * @since 0.4.1 @@ -39,7 +40,7 @@ public class ScriptProcessor implements PageProcessor { enginePool = new ScriptEnginePool(language, threadNum); InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(language.getDefineFile()); try { - defines = IOUtils.toString(resourceAsStream); + defines = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { throw new IllegalArgumentException(e); } diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java index 76b3e86400f88ee672dde0bb764492eca93b054e..4691528ad6f926a80b375925d332467f673cb826 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.scripts; -import org.apache.commons.io.IOUtils; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; +import org.apache.commons.io.IOUtils; + /** * @author code4crafter@gmail.com @@ -35,7 +37,7 @@ public class ScriptProcessorBuilder { public ScriptProcessorBuilder scriptFromFile(String fileName) { try { InputStream resourceAsStream = new FileInputStream(fileName); - this.script = IOUtils.toString(resourceAsStream); + this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { //wrap IOException because I prefer a runtime exception... throw new IllegalArgumentException(e); @@ -46,7 +48,7 @@ public class ScriptProcessorBuilder { public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) { try { InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName); - this.script = IOUtils.toString(resourceAsStream); + this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { //wrap IOException because I prefer a runtime exception... throw new IllegalArgumentException(e); diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 42a6da905de3018f0863607a1cb2656ede884631..be3637692fed54939dd7e1ef8db3934a99331aea 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0