Skip to content

Commit

Permalink
Add explicit disallow feature (#36)
Browse files Browse the repository at this point in the history
* add explicit disallow feature isExplicitlyDisallowed()
  • Loading branch information
SimonC-Audigent authored Oct 28, 2024
1 parent 982657e commit f07168c
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 6 deletions.
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The parser currently supports:

- User-agent:
- Allow:
- Disallow:
- Disallow (with explicit mode support):
- Sitemap:
- Crawl-delay:
- Host:
Expand Down Expand Up @@ -41,6 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
robots.isExplicitlyDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // false
robots.getCrawlDelay('Sams-Bot/1.0'); // 1
robots.getSitemaps(); // ['http://example.com/sitemap.xml']
robots.getPreferredHost(); // example.com
Expand All @@ -62,6 +63,13 @@ Returns true if crawling the specified URL is not allowed for the specified user

This will return `undefined` if the URL isn't valid for this robots.txt.

### isExplicitlyDisallowed(url, ua)

**boolean or undefined**

Returns true if the URL is explicitly disallowed for the specified user agent (user-agent wildcard groups are discarded).

This will return undefined if the URL is not valid for this robots.txt file.

### getMatchingLineNumber(url, [ua])

**number or undefined**
Expand Down
32 changes: 27 additions & 5 deletions Robots.js
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ Robots.prototype.setPreferredHost = function (url) {
this._preferredHost = url;
};

Robots.prototype._getRule = function (url, ua) {
Robots.prototype._getRule = function (url, ua, explicit) {
var parsedUrl = parseUrl(url) || {};
var userAgent = formatUserAgent(ua || '*');

Expand All @@ -374,7 +374,12 @@ Robots.prototype._getRule = function (url, ua) {
return;
}

var rules = this._rules[userAgent] || this._rules['*'] || [];
var rules = this._rules[userAgent];
if (!explicit) {
rules = rules || this._rules['*']
}
rules = rules || []

var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
var rule = findRule(path, rules);

Expand All @@ -392,7 +397,7 @@ Robots.prototype._getRule = function (url, ua) {
* @return {boolean?}
*/
Robots.prototype.isAllowed = function (url, ua) {
var rule = this._getRule(url, ua);
var rule = this._getRule(url, ua, false);

if (typeof rule === 'undefined') {
return;
Expand All @@ -416,7 +421,7 @@ Robots.prototype.isAllowed = function (url, ua) {
* @return {number?}
*/
Robots.prototype.getMatchingLineNumber = function (url, ua) {
	// Non-explicit lookup: wildcard (*) user-agent groups may match too.
	var matchedRule = this._getRule(url, ua, false);

	if (matchedRule) {
		return matchedRule.lineNumber;
	}

	// No matching rule (or URL not valid for this robots.txt).
	return -1;
};
Expand All @@ -425,13 +430,30 @@ Robots.prototype.getMatchingLineNumber = function (url, ua) {
* Returns the opposite of isAllowed()
*
* @param {string} url
* @param {string} ua
* @param {string?} ua
* @return {boolean}
*/
Robots.prototype.isDisallowed = function (url, ua) {
	// Negation of isAllowed(); note undefined (invalid URL) negates to true.
	var allowed = this.isAllowed(url, ua);
	return !allowed;
};

/**
 * Returns true if crawling the URL is explicitly disallowed for the
 * specified user agent. "Explicit" means wildcard (*) user-agent groups
 * are discarded: only rule groups whose User-agent matches `ua` count.
 *
 * This will return undefined if the URL is not valid for this robots.txt file.
 * @param {string} url
 * @param {string} ua
 * @return {boolean?}
 */
Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
	var rule = this._getRule(url, ua, true);

	// _getRule() yields undefined only when the URL does not belong to
	// this robots.txt (different host/protocol/port). Propagate undefined
	// per the documented contract instead of misreporting "disallowed".
	if (typeof rule === 'undefined') {
		return;
	}

	// No explicit matching rule, or a matching Allow rule => not disallowed.
	return !(!rule || rule.allow);
};

/**
* Gets the crawl delay if there is one.
*
Expand Down
1 change: 1 addition & 0 deletions index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ declare module 'robots-parser';
interface Robot {
isAllowed(url: string, ua?: string): boolean | undefined;
isDisallowed(url: string, ua?: string): boolean | undefined;
isExplicitlyDisallowed(url: string, ua: string): boolean | undefined;
getMatchingLineNumber(url: string, ua?: string): number;
getCrawlDelay(ua?: string): number | undefined;
getSitemaps(): string[];
Expand Down
26 changes: 26 additions & 0 deletions test/Robots.js
Original file line number Diff line number Diff line change
Expand Up @@ -861,4 +861,30 @@ describe('Robots', function () {

testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
});

it('should not be disallowed when wildcard is used in explicit mode', function () {
var contents = [
'User-agent: *',
'Disallow: /',
].join('\n')

var url = 'https://www.example.com/hello'
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);

expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false)
})

it('should be disallowed when user agent equal robots rule in explicit mode', function () {
var contents = [
'User-agent: SomeBot',
'Disallow: /',
].join('\n')

var url = 'https://www.example.com/hello'
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);

expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true)
})
});

0 comments on commit f07168c

Please sign in to comment.