diff --git a/README.md b/README.md
index 7d71c57..a2facd1 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ The parser currently supports:
 
 - User-agent:
 - Allow:
-- Disallow:
+- Disallow (with explicit mode support):
 - Sitemap:
 - Crawl-delay:
 - Host:
@@ -41,6 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
 robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
 robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
 robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
+robots.isExplicitlyDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // false
 robots.getCrawlDelay('Sams-Bot/1.0'); // 1
 robots.getSitemaps(); // ['http://example.com/sitemap.xml']
 robots.getPreferredHost(); // example.com
@@ -62,6 +63,13 @@ Returns true if crawling the specified URL is not allowed for the specified user
 
 This will return `undefined` if the URL isn't valid for this robots.txt.
 
+### isExplicitlyDisallowed(url, ua)
+
+**boolean or undefined**
+
+Returns true if the URL is explicitly disallowed for the specified user agent (user agent wildcards are discarded).
+
+This will return `undefined` if the URL is not valid for this robots.txt file.
 ### getMatchingLineNumber(url, [ua])
 
 **number or undefined**
diff --git a/Robots.js b/Robots.js
index f0a8e9e..9fb7cf5 100644
--- a/Robots.js
+++ b/Robots.js
@@ -361,7 +361,7 @@ Robots.prototype.setPreferredHost = function (url) {
 	this._preferredHost = url;
 };
 
-Robots.prototype._getRule = function (url, ua) {
+Robots.prototype._getRule = function (url, ua, explicit) {
 	var parsedUrl = parseUrl(url) || {};
 	var userAgent = formatUserAgent(ua || '*');
 
@@ -374,7 +374,12 @@
 		return;
 	}
 
-	var rules = this._rules[userAgent] || this._rules['*'] || [];
+	var rules = this._rules[userAgent];
+	if (!explicit) {
+		rules = rules || this._rules['*'];
+	}
+	rules = rules || [];
+
 	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
 	var rule = findRule(path, rules);
 
@@ -392,7 +397,7 @@
  * @return {boolean?}
  */
 Robots.prototype.isAllowed = function (url, ua) {
-	var rule = this._getRule(url, ua);
+	var rule = this._getRule(url, ua, false);
 
 	if (typeof rule === 'undefined') {
 		return;
@@ -416,7 +421,7 @@
  * @return {number?}
  */
 Robots.prototype.getMatchingLineNumber = function (url, ua) {
-	var rule = this._getRule(url, ua);
+	var rule = this._getRule(url, ua, false);
 
 	return rule ? rule.lineNumber : -1;
 };
@@ -425,13 +430,30 @@
  * Returns the opposite of isAllowed()
  *
  * @param {string} url
- * @param {string} ua
+ * @param {string?} ua
  * @return {boolean}
  */
 Robots.prototype.isDisallowed = function (url, ua) {
 	return !this.isAllowed(url, ua);
 };
 
+/**
+ * Returns true if the URL is explicitly disallowed
+ * for the specified user agent (user agent wildcards are discarded).
+ *
+ * This will return undefined if the URL is not valid for this robots.txt file.
+ * @param {string} url
+ * @param {string} ua
+ * @return {boolean?}
+ */
+Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
+	var rule = this._getRule(url, ua, true);
+	if (typeof rule === 'undefined') {
+		return;
+	}
+	return !(!rule || rule.allow);
+};
+
 /**
  * Gets the crawl delay if there is one.
  *
diff --git a/index.d.ts b/index.d.ts
index 5446898..0cf4313 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -3,6 +3,7 @@ declare module 'robots-parser';
 interface Robot {
 	isAllowed(url: string, ua?: string): boolean | undefined;
 	isDisallowed(url: string, ua?: string): boolean | undefined;
+	isExplicitlyDisallowed(url: string, ua: string): boolean | undefined;
 	getMatchingLineNumber(url: string, ua?: string): number;
 	getCrawlDelay(ua?: string): number | undefined;
 	getSitemaps(): string[];
diff --git a/test/Robots.js b/test/Robots.js
index 666d9b8..f1575ae 100644
--- a/test/Robots.js
+++ b/test/Robots.js
@@ -861,4 +861,30 @@ describe('Robots', function () {
 
 		testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
 	});
+
+	it('should not be disallowed when wildcard is used in explicit mode', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var userAgent = 'SomeBot';
+		var robots = robotsParser(url, contents);
+
+		expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false);
+	});
+
+	it('should be disallowed when the user agent matches a rule in explicit mode', function () {
+		var contents = [
+			'User-agent: SomeBot',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var userAgent = 'SomeBot';
+		var robots = robotsParser(url, contents);
+
+		expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true);
+	});
 });
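
Note for reviewers: a minimal usage sketch of the explicit-mode behavior this patch adds, mirroring the first new test case. The robots.txt contents, URL, and user agent below are illustrative only.

```js
var robotsParser = require('robots-parser');

// A robots.txt that disallows everything, but only via the wildcard group.
var robots = robotsParser('https://www.example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /',
].join('\n'));

robots.isDisallowed('https://www.example.com/hello', 'SomeBot');
// true: the '*' group still applies in normal mode

robots.isExplicitlyDisallowed('https://www.example.com/hello', 'SomeBot');
// false: there is no 'User-agent: SomeBot' group, and wildcard groups are discarded in explicit mode
```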