diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 609e921..6f78b6e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -41,3 +41,7 @@ jobs:
       - name: "Functional tests with postgres (nightly or pull_request)"
         if: ${{ always() && (github.event_name == 'schedule' || github.event_name == 'pull_request' ) }}
         run: Build/Scripts/runTests.sh -p ${{ matrix.php }} -d postgres -s functional
+
+      - name: "Functional tests with mysql (nightly or pull_request)"
+        if: ${{ always() && (github.event_name == 'schedule' || github.event_name == 'pull_request' ) }}
+        run: Build/Scripts/runTests.sh -p ${{ matrix.php }} -d mysql -s functional
diff --git a/Classes/Command/UnduplicateCommand.php b/Classes/Command/UnduplicateCommand.php
index 92d7859..c8e3336 100644
--- a/Classes/Command/UnduplicateCommand.php
+++ b/Classes/Command/UnduplicateCommand.php
@@ -195,6 +195,11 @@ protected function execute(InputInterface $input, OutputInterface $output): int
     }
 
     /**
+     * Uses GROUP BY BINARY identifier,storage to make sure we don't get results for identifiers which are only duplicate
+     * if checked case-insensitively.
+     *
+     * Database may be case-insensitive, e.g. charset 'utf8mb4', collation 'utf8mb4_unicode_ci'.
+     *
      * @param mixed $onlyThisIdentifier
      * @param int $onlyThisStorage
      * @return Result
@@ -203,9 +208,7 @@ public function findDuplicates(mixed $onlyThisIdentifier, int $onlyThisStorage):
     {
         $queryBuilder = $this->connectionPool->getQueryBuilderForTable('sys_file');
         $queryBuilder->count('*')
-            ->addSelect('identifier', 'storage')
             ->from('sys_file')
-            ->groupBy('identifier', 'storage')
             ->having('COUNT(*) > 1');
         $whereExpressions = [];
         if ($onlyThisIdentifier) {
@@ -223,27 +226,52 @@ public function findDuplicates(mixed $onlyThisIdentifier, int $onlyThisStorage):
         if ($whereExpressions) {
             $queryBuilder->where(...$whereExpressions);
         }
+
+        $concreteQueryBuilder = $queryBuilder->getConcreteQueryBuilder();
+
+        // GROUP BY BINARY `identifier`,`storage`
+        $concreteQueryBuilder->groupBy('BINARY ' . $queryBuilder->quoteIdentifier('identifier'));
+        $concreteQueryBuilder->addGroupBy($queryBuilder->quoteIdentifier('storage'));
+        // SELECT MAX(`identifier`) AS identifier,`storage`
+        $concreteQueryBuilder->addSelect('MAX(' . $queryBuilder->quoteIdentifier('identifier')
+            . ') AS identifier, ' . $queryBuilder->quoteIdentifier('storage'));
+
+        $this->output->writeln('sql=' . $queryBuilder->getSQL(), OutputInterface::VERBOSITY_VERBOSE);
+
         $statement = $queryBuilder
             ->executeQuery();
 
         return $statement;
     }
 
+    /**
+     * Returns all sys_file rows with exactly this identifier on this storage, highest uid first.
+     *
+     * Must make sure we compare identifier case-sensitively, so using "BINARY identifier" here.
+     *
+     * Database may be case-insensitive, e.g. charset 'utf8mb4', collation 'utf8mb4_unicode_ci'.
+     *
+     * @return array<int, array<string, mixed>> rows with the columns uid and identifier
+     */
     private function findDuplicateFilesForIdentifier(string $identifier, int $storage): array
     {
         $fileQueryBuilder = $this->connectionPool->getQueryBuilderForTable('sys_file');
-        return $fileQueryBuilder->select('uid', 'identifier')
+        $fileQueryBuilder->select('uid', 'identifier')
             ->from('sys_file')
             ->where(
-                $fileQueryBuilder->expr()->eq(
-                    'identifier',
-                    $fileQueryBuilder->createNamedParameter($identifier, \PDO::PARAM_STR)
-                ),
                 $fileQueryBuilder->expr()->eq(
                     'storage',
                     $fileQueryBuilder->createNamedParameter($storage, Connection::PARAM_INT)
                 )
-            )->orderBy('uid', 'DESC')->executeQuery()
+            )->orderBy('uid', 'DESC');
+
+        $whereClause = $fileQueryBuilder->expr()->eq('identifier',
+            $fileQueryBuilder->createNamedParameter($identifier, \PDO::PARAM_STR));
+        $whereClause = 'BINARY ' . $whereClause;
+        // andWhere() appends to the storage condition; add('where', ...) would replace it.
+        $fileQueryBuilder->andWhere($whereClause);
+
+        return $fileQueryBuilder->executeQuery()
             ->fetchAllAssociative();
     }
 
diff --git a/README.md b/README.md
index 913f481..e624a72 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,23 @@ Finds and fixes duplicates of sys_file entries pointing to the same file. Merges
 Tested successfully with TYPO3 v12.
 
 ## Warning
+
 Older versions for TYPO3 v8 may not consider identifiers with mixed case or sys_file entries on several storages
 (sys_file.storage) correctly, see issue https://github.com/ElementareTeilchen/unduplicator/issues/2
 
+## Portability (database)
+
+In order to test for duplicates, a database command like this is used:
+
+```sql
+SELECT COUNT(*), MAX(identifier) AS identifier, storage FROM `sys_file` GROUP BY BINARY identifier, storage HAVING COUNT(*) > 1;
+```
+
+Therefore, it is necessary that the underlying database engine supports MAX and BINARY. This command was tested with the following:
+
+* MariaDB
+* MySQL
+
 ## Usage
 We strongly recommend to run the **reference index update** (before and after):
 If not run before or the references are out of date, some references may be overlooked and a sys_file entry deleted which has references.
diff --git a/Tests/Functional/Command/DataSet/sys_file_duplicates_mix_casesensitive.csv b/Tests/Functional/Command/DataSet/sys_file_duplicates_mix_casesensitive.csv
new file mode 100644
index 0000000..4e71b38
--- /dev/null
+++ b/Tests/Functional/Command/DataSet/sys_file_duplicates_mix_casesensitive.csv
@@ -0,0 +1,5 @@
+"sys_file",,,
+,"uid","identifier","storage"
+,1,"/test/ABC.jpg",1
+,2,"/test/ABC.jpg",1
+,3,"/test/abc.jpg",1
diff --git a/Tests/Functional/Command/DataSet/sys_file_duplicates_mix_casesensitive_RESULT.csv b/Tests/Functional/Command/DataSet/sys_file_duplicates_mix_casesensitive_RESULT.csv
new file mode 100644
index 0000000..22650a8
--- /dev/null
+++ b/Tests/Functional/Command/DataSet/sys_file_duplicates_mix_casesensitive_RESULT.csv
@@ -0,0 +1,4 @@
+"sys_file",,,
+,"uid","identifier","storage"
+,2,"/test/ABC.jpg",1
+,3,"/test/abc.jpg",1
diff --git a/Tests/Functional/Command/UnduplicateCommandTest.php b/Tests/Functional/Command/UnduplicateCommandTest.php
index 2127b6a..aa4a371 100644
--- a/Tests/Functional/Command/UnduplicateCommandTest.php
+++ b/Tests/Functional/Command/UnduplicateCommandTest.php
@@ -50,6 +50,22 @@ class UnduplicateCommandTest extends FunctionalTestCase
         self::assertEquals(0, $result['status']);
     }
 
+    /**
+     * * abc.jpg
+     * * ABC.jpg
+     * * ABC.jpg
+     * should remove one ABC.jpg
+     */
+    #[Test] public function unduplicateCommandFixesDuplicatesWithMixCaseSensitives(): void
+    {
+        $this->importCSVDataSet(__DIR__ . '/DataSet/sys_file_duplicates_mix_casesensitive.csv');
+
+        $result = $this->executeConsoleCommand(self::BASE_COMMAND);
+
+        $this->assertCSVDataSet(__DIR__ . '/DataSet/sys_file_duplicates_mix_casesensitive_RESULT.csv');
+        self::assertEquals(0, $result['status']);
+    }
+
     #[Test] public function unduplicateCommandFixesDuplicatesWithReferences(): void
     {
         $this->importCSVDataSet(__DIR__ . '/DataSet/sys_file_duplicates_with_references.csv');