Skip to content

Commit 295f8c1

Browse files
authored
Merge pull request #17 from BenMorel/stars-query-string
Implement wildcard, end-of-string, query string
2 parents 351c96d + cb7b8c7 commit 295f8c1

File tree

5 files changed

+116
-11
lines changed

5 files changed

+116
-11
lines changed

src/RobotsTxt.php

Lines changed: 71 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,34 +34,90 @@ public static function create(string $source): self
3434

3535
public function allows(string $url, ?string $userAgent = '*'): bool
3636
{
37-
$path = parse_url($url, PHP_URL_PATH) ?? '';
37+
$requestUri = '';
38+
39+
$parts = parse_url($url);
40+
41+
if ($parts !== false) {
42+
if (isset($parts['path'])) {
43+
$requestUri .= $parts['path'];
44+
}
45+
46+
if (isset($parts['query'])) {
47+
$requestUri .= '?'.$parts['query'];
48+
} elseif ($this->hasEmptyQueryString($url)) {
49+
$requestUri .= '?';
50+
}
51+
}
3852

3953
$disallows = $this->disallowsPerUserAgent[$userAgent] ?? $this->disallowsPerUserAgent['*'] ?? [];
4054

41-
return ! $this->pathIsDenied($path, $disallows);
55+
return ! $this->pathIsDenied($requestUri, $disallows);
4256
}
4357

44-
protected function pathIsDenied(string $path, array $disallows): bool
58+
protected function pathIsDenied(string $requestUri, array $disallows): bool
4559
{
4660
foreach ($disallows as $disallow) {
47-
$trimmedDisallow = rtrim($disallow, '/');
61+
if ($disallow === '') {
62+
continue;
63+
}
4864

49-
if (in_array($path, [$disallow, $trimmedDisallow])) {
50-
return true;
65+
$stopAtEndOfString = false;
66+
67+
if ($disallow[-1] === '$') {
68+
// if the pattern ends with a dollar sign, the string must end there
69+
$disallow = substr($disallow, 0, -1);
70+
$stopAtEndOfString = true;
5171
}
5272

53-
if (! $this->concernsDirectory($disallow)) {
54-
continue;
73+
// convert to regexp
74+
$disallowRegexp = preg_quote($disallow, '/');
75+
76+
// the pattern must start at the beginning of the string...
77+
$disallowRegexp = '^'.$disallowRegexp;
78+
79+
// ...and optionally stop at the end of the string
80+
if ($stopAtEndOfString) {
81+
$disallowRegexp .= '$';
5582
}
5683

57-
if ($this->isUrlInDirectory($path, $disallow)) {
84+
// replace (preg_quote'd) stars with an eager match
85+
$disallowRegexp = str_replace('\\*', '.*', $disallowRegexp);
86+
87+
// enclose in delimiters
88+
$disallowRegexp = '/'.$disallowRegexp.'/';
89+
90+
if (preg_match($disallowRegexp, $requestUri) === 1) {
5891
return true;
5992
}
6093
}
6194

6295
return false;
6396
}
6497

98+
/**
99+
* Checks for an empty query string.
100+
*
101+
* This works around the fact that parse_url() will not set the 'query' key when the query string is empty.
102+
* See: https://bugs.php.net/bug.php?id=78385
103+
*/
104+
protected function hasEmptyQueryString(string $url) : bool
105+
{
106+
if ($url === '') {
107+
return false;
108+
}
109+
110+
if ($url[-1] === '?') { // ends with ?
111+
return true;
112+
}
113+
114+
if (strpos($url, '?#') !== false) { // empty query string, followed by a fragment
115+
return true;
116+
}
117+
118+
return false;
119+
}
120+
65121
protected function getDisallowsPerUserAgent(string $content): array
66122
{
67123
$lines = explode(PHP_EOL, $content);
@@ -117,11 +173,17 @@ protected function parseDisallow(string $line): string
117173
return trim(substr_replace(strtolower(trim($line)), '', 0, 8), ': ');
118174
}
119175

176+
/**
177+
* @deprecated
178+
*/
120179
protected function concernsDirectory(string $path): bool
121180
{
122181
return substr($path, strlen($path) - 1, 1) === '/';
123182
}
124183

184+
/**
185+
* @deprecated
186+
*/
125187
protected function isUrlInDirectory(string $url, string $path): bool
126188
{
127189
return strpos($url, $path) === 0;

tests/RobotsTest.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public function it_can_discover_default_robots_file()
5555

5656
$robots = Robots::create();
5757

58-
$this->assertFalse($robots->mayIndex($this->getLocalTestServerUrl('/nl/admin')));
58+
$this->assertTrue($robots->mayIndex($this->getLocalTestServerUrl('/nl/admin')));
5959

6060
$this->assertFalse($robots->mayIndex($this->getLocalTestServerUrl('/nl/admin/')));
6161

tests/RobotsTxtTest.php

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,4 +77,37 @@ public function it_can_handle_an_empty_robots_txt()
7777

7878
$this->assertTrue($robots->allows('/'));
7979
}
80+
81+
/** @test */
82+
public function it_can_handle_star_in_pattern()
83+
{
84+
$robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');
85+
86+
$this->assertTrue($robots->allows('/en/admin'));
87+
$this->assertFalse($robots->allows('/en/admin/'));
88+
$this->assertFalse($robots->allows('/en/admin/users'));
89+
}
90+
91+
/** @test */
92+
public function it_can_handle_dollar_in_pattern()
93+
{
94+
$robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');
95+
96+
$this->assertTrue($robots->allows('/fr/ad'));
97+
$this->assertFalse($robots->allows('/fr/admin'));
98+
$this->assertTrue($robots->allows('/fr/admin/'));
99+
$this->assertTrue($robots->allows('/fr/admin?'));
100+
$this->assertTrue($robots->allows('/fr/admin?test'));
101+
}
102+
103+
/** @test */
104+
public function it_can_handle_query_strings()
105+
{
106+
$robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');
107+
108+
$this->assertTrue($robots->allows('/en/admin'));
109+
$this->assertTrue($robots->allows('/en/admin?id=123'));
110+
$this->assertFalse($robots->allows('/en/admin?print'));
111+
$this->assertFalse($robots->allows('/en/admin?print=true'));
112+
}
80113
}

tests/data/robots.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22

33
User-agent: *
44

5+
Disallow: /*?print
56
Disallow: /nl/admin/
6-
Disallow: /en/admin/
7+
Disallow: /en/admin/*
8+
Disallow: /fr/admin$
79
Disallow: /es/admin-disallow/
810
User-agent: google
911

tests/server/server.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,14 @@ app.get('/nl', function (req, res) {
4949
res.end();
5050
});
5151

52+
app.get('/nl/admin', function (req, res) {
53+
console.log('Request at /nl/admin');
54+
55+
res.writeHead(200);
56+
57+
res.end();
58+
});
59+
5260
var server = app.listen(4020, function () {
5361
var host = 'localhost';
5462
var port = server.address().port;

0 commit comments

Comments
 (0)