@@ -34,34 +34,90 @@ public static function create(string $source): self
3434
/**
 * Determines whether the given URL may be crawled by the given user agent.
 *
 * The URL is reduced to its path plus query string (the fragment is not used)
 * and matched against the disallow rules stored for the user agent, falling
 * back to the rules stored for '*' when the agent has no entry of its own.
 *
 * @param string      $url       Absolute or relative URL to check.
 * @param string|null $userAgent User agent whose rules apply; defaults to '*'.
 *
 * @return bool True when no disallow rule matches the URL.
 */
public function allows(string $url, ?string $userAgent = '*'): bool
{
    $path = '';
    $queryString = '';

    $parts = parse_url($url);

    if ($parts !== false) {
        $path = $parts['path'] ?? '';

        if (isset($parts['query'])) {
            $queryString = '?'.$parts['query'];
        } elseif ($this->hasEmptyQueryString($url)) {
            // parse_url() does not set the 'query' key for an empty query string
            $queryString = '?';
        }
    }

    // An empty path addresses the site root: "https://example.com" has to be
    // checked exactly like "https://example.com/", otherwise a "Disallow: /"
    // rule (anchored on a leading slash) could never match it.
    if ($path === '') {
        $path = '/';
    }

    $disallows = $this->disallowsPerUserAgent[$userAgent] ?? $this->disallowsPerUserAgent['*'] ?? [];

    return ! $this->pathIsDenied($path.$queryString, $disallows);
}
4357
/**
 * Checks whether a request URI is matched by at least one disallow rule.
 *
 * Each rule is a prefix pattern in which '*' matches any run of characters
 * and a trailing '$' pins the match to the end of the URI. Empty rules are
 * skipped, so they never deny anything.
 *
 * @param string   $requestUri Path plus optional query string to test.
 * @param string[] $disallows  Disallow patterns to test against.
 *
 * @return bool True as soon as one pattern matches, false otherwise.
 */
protected function pathIsDenied(string $requestUri, array $disallows): bool
{
    foreach ($disallows as $disallow) {
        if ($disallow === '') {
            continue;
        }

        $stopAtEndOfString = false;

        if (substr($disallow, -1) === '$') {
            // if the pattern ends with a dollar sign, the string must end there
            $disallow = substr($disallow, 0, -1);
            $stopAtEndOfString = true;
        }

        // consecutive stars mean the same as one star; collapsing them keeps
        // the generated expression free of redundant ".*.*" backtracking
        $disallow = preg_replace('/\*+/', '*', $disallow) ?? $disallow;

        // quote everything, then turn the (quoted) stars back into wildcards
        $disallowRegexp = str_replace('\\*', '.*', preg_quote($disallow, '/'));

        // the pattern must start at the beginning of the string
        // and optionally stop at the end of the string
        $disallowRegexp = '^'.$disallowRegexp.($stopAtEndOfString ? '$' : '');

        // D: "$" matches only at the very end, not before a trailing newline
        // s: "." (and therefore "*") also matches newline characters
        if (preg_match('/'.$disallowRegexp.'/Ds', $requestUri) === 1) {
            return true;
        }
    }

    return false;
}
6497
/**
 * Checks for an empty query string, i.e. a '?' that is immediately followed
 * by the end of the URL or by the fragment.
 *
 * This works around the fact that parse_url() will not set the 'query' key when the query string is empty.
 * See: https://bugs.php.net/bug.php?id=78385
 *
 * @param string $url URL to inspect.
 *
 * @return bool True when the URL carries a query delimiter with nothing after it.
 */
protected function hasEmptyQueryString(string $url): bool
{
    // Everything after the first '#' is the fragment; a '?' in there is not a
    // query delimiter and must not be taken for an empty query string.
    $fragmentStart = strpos($url, '#');
    $withoutFragment = $fragmentStart === false ? $url : substr($url, 0, $fragmentStart);

    $queryStart = strpos($withoutFragment, '?');

    if ($queryStart === false) {
        return false;
    }

    // the query string is empty when nothing follows its delimiter
    return $queryStart === strlen($withoutFragment) - 1;
}
120+
65121 protected function getDisallowsPerUserAgent (string $ content ): array
66122 {
67123 $ lines = explode (PHP_EOL , $ content );
@@ -117,11 +173,17 @@ protected function parseDisallow(string $line): string
117173 return trim (substr_replace (strtolower (trim ($ line )), '' , 0 , 8 ), ': ' );
118174 }
119175
/**
 * Tells whether the given path ends with a slash.
 *
 * @deprecated
 */
protected function concernsDirectory(string $path): bool
{
    // a negative offset reads the final character directly
    $lastCharacter = substr($path, -1);

    return $lastCharacter === '/';
}
124183
184+ /**
185+ * @deprecated
186+ */
125187 protected function isUrlInDirectory (string $ url , string $ path ): bool
126188 {
127189 return strpos ($ url , $ path ) === 0 ;
0 commit comments