diff --git a/README.md b/README.md index 750a76841..89536c927 100644 --- a/README.md +++ b/README.md @@ -118,9 +118,9 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) There are more examples in `webmagic-samples` package. -### Lisence: +### License: -Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0) +Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0) ### Thanks: diff --git a/pom.xml b/pom.xml index cf7d81612..5f1bdf901 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.9.1 + 0.10.0 4.0.0 pom diff --git a/src/site/site.xml b/src/site/site.xml index d2d5caacd..b78651960 100644 --- a/src/site/site.xml +++ b/src/site/site.xml @@ -5,7 +5,7 @@ org.apache.maven.skins maven-fluido-skin - 1.9 + 1.11.1 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 983d309b1..021a83f3e 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.1 + 0.10.0 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 6370171df..17f8b03dd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -49,15 +49,34 @@ public class Page { private byte[] bytes; - private List targetRequests = new ArrayList(); + private List targetRequests = new ArrayList<>(); private String charset; public Page() { } - public static Page fail(){ + /** + * Returns a {@link Page} whose {@link #downloadSuccess} is {@code false}. + * + * @return the page. + * @deprecated Use {@link #fail(Request)} instead. + */ + @Deprecated + public static Page fail() { + return fail(null); + } + + /** + * Returns a {@link Page} whose {@link #downloadSuccess} is {@code false} + * and whose {@link #request} is the given request. + * + * @return the page. 
+ * @since 0.10.0 + */ + public static Page fail(Request request){ Page page = new Page(); + page.setRequest(request); page.setDownloadSuccess(false); return page; } @@ -123,13 +142,7 @@ public List getTargetRequests() { * @param requests requests */ public void addTargetRequests(Iterable requests) { - for (String s : requests) { - if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - continue; - } - s = UrlUtils.canonicalizeUrl(s, url.toString()); - targetRequests.add(new Request(s)); - } + addTargetRequests(requests, 0); // Default priority is 0 } /** @@ -139,13 +152,32 @@ public void addTargetRequests(Iterable requests) { * @param priority priority */ public void addTargetRequests(Iterable requests, long priority) { - for (String s : requests) { - if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - continue; - } - s = UrlUtils.canonicalizeUrl(s, url.toString()); - targetRequests.add(new Request(s).setPriority(priority)); + if(requests == null) { + return; + } + + for (String req : requests) { + addRequestIfValid(req, priority); + } + } + + /** + * Helper method to add a request if it's valid. 
+ * + * @param url URL to add + * @param priority Priority for the URL + */ + private void addRequestIfValid(String url, long priority) { + if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { + return; + } + + String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); + Request req = new Request(canonicalizedUrl); + if(priority != 0) { // keep negative priorities too; the code this replaces always applied the given priority + req.setPriority(priority); } + targetRequests.add(req); } /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index ea3bbc590..6a400e321 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -36,26 +36,62 @@ public Html download(String url, String charset) { return (Html) page.getHtml(); } + /** + * @param request the {@link Request}. + * @deprecated Use {@link #onSuccess(Page, Task)} instead. + */ @Deprecated protected void onSuccess(Request request) { } /** + * @param request the {@link Request}. + * @param task the {@link Task}. * @since 0.7.6 + * @deprecated Use {@link #onSuccess(Page, Task)} instead. */ + @Deprecated protected void onSuccess(Request request, Task task) { this.onSuccess(request); } + /** + * @param page the {@link Page}. + * @param task the {@link Task}. + * @since 0.10.0 + */ + protected void onSuccess(Page page, Task task) { + this.onSuccess(page.getRequest(), task); + } + + /** + * @param request the {@link Request}. + * @deprecated Use {@link #onError(Page, Task, Throwable)} instead. + */ @Deprecated protected void onError(Request request) { } /** + * @param request the {@link Request}. + * @param task the {@link Task}. + * @param e the exception. * @since 0.7.6 + * @deprecated Use {@link #onError(Page, Task, Throwable)} instead. 
*/ + @Deprecated protected void onError(Request request, Task task, Throwable e) { this.onError(request); } + /** + * @param page the {@link Page}. + * @param task the {@link Task}. + * @param e the exception. + * @since 0.10.0 + */ + protected void onError(Page page, Task task, Throwable e) { + this.onError(page.getRequest(), task, e); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 2f3ef58ed..80e7b72c9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -79,18 +79,18 @@ public Page download(Request request, Task task) { CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); - Page page = Page.fail(); + Page page = Page.fail(request); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? 
request.getCharset() : task.getSite().getCharset(), httpResponse, task); - onSuccess(request, task); + onSuccess(page, task); logger.info("downloading page success {}", request.getUrl()); return page; } catch (IOException e) { - onError(request, task, e); + onError(page, task, e); logger.info("download page {} error", request.getUrl(), e); return page; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index 8775af108..1fb35f1a8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -3,6 +3,7 @@ import java.util.ArrayList; import java.util.List; + import org.apache.commons.collections4.CollectionUtils; /** @@ -55,11 +56,12 @@ public Selectable jsonPath(String jsonPath) { @Override public String get() { - if (CollectionUtils.isNotEmpty(all())) { - return all().get(0); - } else { - return null; - } + List sourceTexts = all(); + if (CollectionUtils.isNotEmpty(sourceTexts)) { + return sourceTexts.get(0); + } + return null; + } @Override @@ -91,8 +93,9 @@ public Selectable replace(String regex, String replacement) { } public String getFirstSourceText() { - if (getSourceTexts() != null && getSourceTexts().size() > 0) { - return getSourceTexts().get(0); + List sourceTexts = getSourceTexts(); + if (CollectionUtils.isNotEmpty(sourceTexts)) { + return sourceTexts.get(0); } return null; } @@ -104,6 +107,6 @@ public String toString() { @Override public boolean match() { - return getSourceTexts() != null && getSourceTexts().size() > 0; + return CollectionUtils.isNotEmpty(getSourceTexts()); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java index 55e185105..fbeb8ed3b 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java @@ -6,12 +6,6 @@ public abstract class NumberUtils { public static int compareLong(long o1, long o2) { - if (o1 < o2) { - return -1; - } else if (o1 == o2) { - return 0; - } else { - return 1; - } + return Long.compare(o1, o2); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java index 23e1644ce..a2ca5afd0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java @@ -21,10 +21,10 @@ public static Set newHashSet(T... t){ } public static List newArrayList(T... t){ - List set = new ArrayList(t.length); + List list = new ArrayList(t.length); for (T t1 : t) { - set.add(t1); + list.add(t1); } - return set; + return list; } } diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 21fa00128..4109c49fc 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.9.1 + 0.10.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 008d00443..b47ae3614 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.1 + 0.10.0 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 4f1eee8e6..31dfca75a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -88,7 +88,7 @@ public Page download(Request request, Task task) { 
logger.info("downloading page: " + request.getUrl()); } - Page page = Page.fail(); + Page page = Page.fail(request); try { String content = getPage(request); if (!content.contains("HTTP request failed")) { @@ -98,9 +98,9 @@ public Page download(Request request, Task task) { page.setRequest(request); page.setStatusCode(200); } - onSuccess(request, task); + onSuccess(page, task); } catch (Exception e) { - onError(request, task, e); + onError(page, task, e); logger.warn("download page {} error", request.getUrl(), e); } return page; diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 783a5e9ea..08e70c161 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.10.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 6982bc22e..4a2b358d0 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.10.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 30984e39d..92914655a 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.10.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 489bbbc95..5c2e50b2a 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.10.0 4.0.0 diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 39b3bc914..874f8aef7 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -74,7 +74,7 @@ public SeleniumDownloader setSleepTime(int sleepTime) { public Page 
download(Request request, Task task) { checkInit(); WebDriver webDriver = null; - Page page = Page.fail(); + Page page = Page.fail(request); try { webDriver = webDriverPool.get(); @@ -111,10 +111,10 @@ public Page download(Request request, Task task) { page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - onSuccess(request, task); + onSuccess(page, task); } catch (Exception e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request, task, e); + onError(page, task, e); } finally { if (webDriver != null) { webDriverPool.returnToPool(webDriver);