Skip to content

Commit be561d1

Browse files
committed
Added a source filter that pre-processes files to eliminate unnecessary scanning of exact replicas of files.
A SourceFilter class was added whose goal is to compare two source trees and filter out any duplicate source code files. It does so by hashing each file's content and comparing the before and after hash sets. The intersection set is removed from the before and after lists of files, since there is no point in scanning files with the exact same source code. The benefits of this are important both in terms of CPU usage and memory usage, since we do not parse code that will in the end turn out not to have changed. For example, here are the before and after figures for this change, comparing Symfony 2.5.9 with 2.6.3. Before: [Symfony 2.5.9] 2936 php files in src; [Symfony 2.6.3] 3078 php files in src; 2564 files are exactly the same. After: [Symfony 2.5.9] 367 scanned and parsed (12.5% of the original amount); [Symfony 2.6.3] 509 scanned and parsed (16.5% of the original amount). But most important of all, it can now compare the whole source trees in a reasonable amount of time, something it couldn't do before.
1 parent d55a2f8 commit be561d1

File tree

2 files changed

+38
-0
lines changed

2 files changed

+38
-0
lines changed

src/PHPSemVerChecker/Console/Command/CompareCommand.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
use File_Iterator_Facade;
66
use PHPSemVerChecker\Analyzer\Analyzer;
7+
use PHPSemVerChecker\Filter\SourceFilter;
78
use PHPSemVerChecker\Reporter\Reporter;
89
use PHPSemVerChecker\Scanner\Scanner;
910
use Symfony\Component\Console\Command\Command;
@@ -43,6 +44,14 @@ protected function execute(InputInterface $input, OutputInterface $output)
4344

4445
$progress = new ProgressBar($output, count($sourceBefore) + count($sourceAfter));
4546
$progress->setFormat("%message%\n%current%/%max% [%bar%] %percent:3s%% %elapsed:6s%/%estimated:-6s% %memory:6s%");
47+
$progress->setMessage('Pre-processing before/after files');
48+
$progress->start();
49+
50+
$sourceFilter = new SourceFilter();
51+
$identicalCount = $sourceFilter->filter($sourceBefore, $sourceAfter);
52+
53+
$progress->start(count($sourceBefore) + count($sourceAfter));
54+
4655
$progress->setMessage('Scanning before files');
4756
foreach ($sourceBefore as $file) {
4857
$scannerBefore->scan($file);
@@ -68,6 +77,7 @@ protected function execute(InputInterface $input, OutputInterface $output)
6877

6978
$duration = microtime(true) - $startTime;
7079
$output->writeln('');
80+
$output->writeln('[Scanned files] Before: ' . count($sourceBefore) . ', After: ' . count($sourceAfter) . ', Identical: ' . $identicalCount);
7181
$output->writeln('Time: ' . round($duration, 3) . ' seconds, Memory: ' . round(memory_get_peak_usage() / 1024 / 1024, 3) . ' MB');
7282
}
7383
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
<?php
2+
3+
namespace PHPSemVerChecker\Filter;
4+
5+
class SourceFilter
{
    /**
     * Removes from both lists every file whose content also exists in the other list.
     *
     * Each file is hashed (SHA-1 of its content). A hash that occurs in both lists marks
     * content that is unchanged between the two trees, so every file carrying that hash is
     * dropped from both lists. Files are matched by content only, not by path.
     *
     * Files that share content with each other but have no counterpart in the other list
     * are all kept, and the relative order of the remaining files is preserved. A file that
     * cannot be hashed (missing, unreadable, not a regular file) is never considered
     * identical to anything and is kept, so that the later scanning step can report it.
     *
     * Both arrays are modified in place and re-indexed from 0.
     *
     * @param array $filesBefore paths of the "before" source tree (modified in place)
     * @param array $filesAfter  paths of the "after" source tree (modified in place)
     * @return int number of distinct file contents found in both lists
     */
    public function filter(array &$filesBefore, array &$filesAfter)
    {
        $hashesBefore = $this->hashFiles($filesBefore);
        $hashesAfter = $this->hashFiles($filesAfter);

        // Sets keyed by hash: the key intersection is the content present on both sides.
        $shared = array_intersect_key(
            $this->toHashSet($hashesBefore),
            $this->toHashSet($hashesAfter)
        );

        $filesBefore = $this->withoutSharedContent($filesBefore, $hashesBefore, $shared);
        $filesAfter = $this->withoutSharedContent($filesAfter, $hashesAfter, $shared);

        return count($shared);
    }

    /**
     * Computes the content hash of every file, keeping the keys of the input list.
     *
     * @param array $files
     * @return array same keys as $files; each value is a SHA-1 hex string, or null when
     *               the file could not be hashed
     */
    private function hashFiles(array $files)
    {
        $hashes = [];
        foreach ($files as $index => $file) {
            // Guard first so that sha1_file() does not emit a warning on bad paths.
            $hash = (is_file($file) && is_readable($file)) ? sha1_file($file) : false;
            $hashes[$index] = ($hash === false) ? null : $hash;
        }

        return $hashes;
    }

    /**
     * Builds a set (hash => true) from a list of hashes, ignoring unhashable entries.
     *
     * @param array $hashes
     * @return array
     */
    private function toHashSet(array $hashes)
    {
        $set = [];
        foreach ($hashes as $hash) {
            if ($hash !== null) {
                $set[$hash] = true;
            }
        }

        return $set;
    }

    /**
     * Returns the files whose hash is not part of the shared set, in their original order.
     *
     * @param array $files
     * @param array $hashes hashes indexed like $files
     * @param array $shared set of shared hashes (hash => true)
     * @return array re-indexed list of remaining files
     */
    private function withoutSharedContent(array $files, array $hashes, array $shared)
    {
        $remaining = [];
        foreach ($files as $index => $file) {
            $hash = $hashes[$index];
            if ($hash === null || !isset($shared[$hash])) {
                $remaining[] = $file;
            }
        }

        return $remaining;
    }
}

0 commit comments

Comments
 (0)