Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [3.4.1] - 2025-03-08
### Fixed
* Because only GET requests can be executed when loading with the (headless) Chrome browser:
  * The loader now automatically switches to the HTTP client for POST, PUT, PATCH, and DELETE requests and logs a warning.
  * A warning is logged when attempting to use "Post Browser Navigate Hooks" with POST, PUT, PATCH, or DELETE requests.
  * Consequently, the `useBrowser()` method, introduced in v3.4.0, is also limited to GET requests.

## [3.4.0] - 2025-03-06
### Added
* Two new methods to the base class of all `Http` steps:
Expand Down
30 changes: 27 additions & 3 deletions src/Steps/Loading/HttpBase.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Utils\HttpHeaders;
use Exception;
use GuzzleHttp\Psr7\Request;
use InvalidArgumentException;
use Psr\Http\Message\RequestInterface;
Expand Down Expand Up @@ -140,6 +141,15 @@ public function useInputKeyAsHeaders(string $key): static

public function postBrowserNavigateHook(Closure $callback): static
{
if ($this->method !== 'GET') {
$this->logger?->warning(
'A ' . $this->method . ' request cannot be executed using the (headless) browser, so post browser ' .
'navigate hooks can\'t be defined for this step either.',
);

return $this;
}

$this->postBrowserNavigateHooks[] = $callback;

return $this;
Expand Down Expand Up @@ -255,13 +265,14 @@ protected function getResponseFromRequest(RequestInterface $request): ?Responded

/**
* @return array<string, mixed>
* @throws Exception
*/
private function applyTempLoaderCustomizations(): array
{
$resetConfig = ['resetToHttpClient' => false];

$loader = $this->getLoader();

$resetConfig = ['resetToHttpClient' => false, 'resetToBrowser' => false];

if (!empty($this->postBrowserNavigateHooks) && $loader->usesHeadlessBrowser()) {
$loader->browser()->setTempPostNavigateHooks($this->postBrowserNavigateHooks);
}
Expand All @@ -270,7 +281,18 @@ private function applyTempLoaderCustomizations(): array
$loader->skipCacheForNextRequest();
}

if ($this->forceBrowserUsage && !$loader->usesHeadlessBrowser()) {
if ($this->method !== 'GET' && ($this->forceBrowserUsage || $loader->usesHeadlessBrowser())) {
$this->logger?->warning(
'The (headless) browser can only be used for GET requests! Therefore this step will use the HTTP ' .
'client for loading.',
);

if ($loader->usesHeadlessBrowser()) {
$loader->useHttpClient();

$resetConfig['resetToBrowser'] = true;
}
} elseif ($this->forceBrowserUsage && !$loader->usesHeadlessBrowser()) {
$resetConfig['resetToHttpClient'] = true;

$loader->useHeadlessBrowser();
Expand All @@ -291,6 +313,8 @@ private function resetTempLoaderCustomizations(array $resetConfig): void
$loader->useHttpClient();
} catch (Throwable) {
}
} elseif ($resetConfig['resetToBrowser']) {
$loader->useHeadlessBrowser();
}
}

Expand Down
94 changes: 94 additions & 0 deletions tests/Steps/Loading/HttpTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Loading\Http\Browser\BrowserAction;
use Crwlr\Url\Url;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
Expand Down Expand Up @@ -93,6 +94,10 @@
return $request->getMethod() === $httpMethod;
})->once();

if ($httpMethod !== 'GET') {
$loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
}

$step = (new Http($httpMethod))->setLoader($loader);

helper_traverseIterable($step->invokeStep(new Input('https://www.foo.bar/baz')));
Expand Down Expand Up @@ -140,6 +145,8 @@
return $request->getBody()->getContents() === $body;
})->once();

$loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();

$step = (new Http('PATCH', [], $body))->setLoader($loader);

helper_traverseIterable($step->invokeStep(new Input('https://github.com/')));
Expand All @@ -152,6 +159,8 @@
return $request->getProtocolVersion() === $httpVersion;
})->once();

$loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();

$step = (new Http('PATCH', [], 'body', $httpVersion))->setLoader($loader);

helper_traverseIterable($step->invokeStep(new Input('https://packagist.org/packages/crwlr/url')));
Expand All @@ -164,6 +173,10 @@
return $request->getMethod() === $httpMethod;
})->once();

if ($httpMethod !== 'GET') {
$loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
}

$step = (Http::{strtolower($httpMethod)}())->setLoader($loader);

helper_traverseIterable($step->invokeStep(new Input('https://dev.to/otsch')));
Expand All @@ -178,6 +191,10 @@ function (string $httpMethod) {
return $request->getMethod() === $httpMethod;
})->once()->andReturn(new RespondedRequest(new Request('GET', '/foo'), new Response(200)));

if ($httpMethod !== 'GET') {
$loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
}

$step = (Http::{strtolower($httpMethod)}())
->setLoader($loader)
->stopOnErrorResponse();
Expand Down Expand Up @@ -466,6 +483,19 @@ function () {
},
);

// Verifies that calling postBrowserNavigateHook() on a step created with a non-GET method
// logs exactly one warning and does not register the hook. Runs once per method in the dataset.
it('rejects post browser navigate hooks, when the HTTP method is not GET', function (string $httpMethod) {
$logger = new DummyLogger();

// The logger is attached before the hook is added, so the warning emitted by the call is captured.
$step = (new Http($httpMethod))->addLogger($logger)->postBrowserNavigateHook(BrowserAction::wait(1.0));

// Expect the exact warning text and an empty hook list (read from the step via invade()).
expect($logger->messages)->toHaveCount(1)
->and($logger->messages[0]['message'])->toBe(
'A ' . $httpMethod . ' request cannot be executed using the (headless) browser, so post browser ' .
'navigate hooks can\'t be defined for this step either.',
)
->and(invade($step)->postBrowserNavigateHooks)->toBe([]);
})->with(['POST', 'PUT', 'PATCH', 'DELETE']);

it(
'calls the HttpLoader::skipCacheForNextRequest() method before calling load when the skipCache() method was called',
function () {
Expand Down Expand Up @@ -557,6 +587,70 @@ function () {
},
);

// Verifies that useBrowser() is ignored for non-GET steps when the loader is on the HTTP client:
// the loader must never be switched to the headless browser and exactly one warning is logged.
it(
'does not switch the loader to use the browser, when useBrowser() was called, the loader is configured to use ' .
'the HTTP client, but the request method is not GET',
function (string $httpMethod) {
$logger = new DummyLogger();

$loader = Mockery::mock(HttpLoader::class);

// The mocked loader reports (once) that it is not using the headless browser.
$loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);

// Core expectation: no switch to the browser may happen.
$loader->shouldNotReceive('useHeadlessBrowser');

$respondedRequest = new RespondedRequest(
new Request($httpMethod, 'https://www.example.com/something'),
new Response(200, body: Utils::streamFor('Something!')),
);

$loader->shouldReceive('load')->once()->andReturn($respondedRequest);

// The dataset value (lowercase method name) is used as the name of the static factory on Http.
$step = Http::{$httpMethod}()->setLoader($loader)->addLogger($logger)->useBrowser();

helper_invokeStepWithInput($step);

// Exactly one warning with the exact text is expected.
expect($logger->messages)->toHaveCount(1)
->and($logger->messages[0]['message'])->toBe(
'The (headless) browser can only be used for GET requests! Therefore this step will use the HTTP ' .
'client for loading.',
);
},
)->with(['post', 'put', 'patch', 'delete']);

// Verifies that, for non-GET steps, a loader currently using the headless browser is switched to
// the HTTP client for the request, and that exactly one warning is logged.
it(
'automatically switches the loader to use the HTTP client, when the HTTP method is not GET and the loader is ' .
'configured to use the browser',
function (string $httpMethod) {
$logger = new DummyLogger();

$loader = Mockery::mock(HttpLoader::class);

// The mocked loader reports (once) that it is using the headless browser.
$loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(true);

// Expect one switch to the HTTP client for the request ...
$loader->shouldReceive('useHttpClient')->once();

// ... and one call switching to the browser — presumably the reset back to the loader's
// original mode after loading; confirm against the step's reset logic.
$loader->shouldReceive('useHeadlessBrowser')->once();

$respondedRequest = new RespondedRequest(
new Request($httpMethod, 'https://www.example.com/something'),
new Response(200, body: Utils::streamFor('Something!')),
);

$loader->shouldReceive('load')->once()->andReturn($respondedRequest);

// The dataset value (lowercase method name) is used as the name of the static factory on Http.
$step = Http::{$httpMethod}()->setLoader($loader)->addLogger($logger)->useBrowser();

helper_invokeStepWithInput($step);

// Exactly one warning with the exact text is expected.
expect($logger->messages)->toHaveCount(1)
->and($logger->messages[0]['message'])->toBe(
'The (headless) browser can only be used for GET requests! Therefore this step will use the HTTP ' .
'client for loading.',
);
},
)->with(['post', 'put', 'patch', 'delete']);

it(
'switches back the loader to use the HTTP client, when stopOnErrorResponse() and useBrowser() was called and ' .
'loading throws an exception',
Expand Down