Skip to content

Commit

Permalink
Improve db check for duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
sypets committed Jan 21, 2025
1 parent 191e8c6 commit 37ab27e
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 8 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,7 @@ jobs:
- name: "Functional tests with postgres (nightly or pull_request)"
if: ${{ always() && (github.event_name == 'schedule' || github.event_name == 'pull_request' ) }}
run: Build/Scripts/runTests.sh -p ${{ matrix.php }} -d postgres -s functional

- name: "Functional tests with mysql (nightly or pull_request)"
if: ${{ always() && (github.event_name == 'schedule' || github.event_name == 'pull_request' ) }}
run: Build/Scripts/runTests.sh -p ${{ matrix.php }} -d mysql -s functional
39 changes: 31 additions & 8 deletions Classes/Command/UnduplicateCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,11 @@ protected function execute(InputInterface $input, OutputInterface $output): int
}

/**
* Uses GROUP BY BINARY identifier,storage to make sure we don't get results for identifiers which are only duplicate
* if checked case-insensitively.
*
* Database may be case-insensitive, e.g. charset 'utf8mb5', collation 'utf8mb4_unicode_ci'.
*
* @param mixed $onlyThisIdentifier
* @param int $onlyThisStorage
* @return Result
Expand All @@ -203,9 +208,7 @@ public function findDuplicates(mixed $onlyThisIdentifier, int $onlyThisStorage):
{
$queryBuilder = $this->connectionPool->getQueryBuilderForTable('sys_file');
$queryBuilder->count('*')
->addSelect('identifier', 'storage')
->from('sys_file')
->groupBy('identifier', 'storage')
->having('COUNT(*) > 1');
$whereExpressions = [];
if ($onlyThisIdentifier) {
Expand All @@ -223,27 +226,47 @@ public function findDuplicates(mixed $onlyThisIdentifier, int $onlyThisStorage):
if ($whereExpressions) {
$queryBuilder->where(...$whereExpressions);
}

$concreteQueryBuilder = $queryBuilder->getConcreteQueryBuilder();

// GROUP BY BINARY `identifier`,`storage
$concreteQueryBuilder->groupBy('BINARY ' . $queryBuilder->quoteIdentifier('identifier'));
$concreteQueryBuilder->addGroupBy($queryBuilder->quoteIdentifier('storage'));
// SELECT MAX(`identifier`) AS identifier,`storage`
$concreteQueryBuilder->addSelect('MAX(' . $queryBuilder->quoteIdentifier('identifier')
. ') AS identifier, ' . $queryBuilder->quoteIdentifier('storage'));

$this->output->writeln('sql=' . $queryBuilder->getSQL(), OutputInterface::VERBOSITY_VERBOSE);

$statement = $queryBuilder
->executeQuery();
return $statement;
}

/**
* Must make sure we compare identifier case-sensitively, so using "BINARY identifier" here .
*
* Database may be case-insensitive, e.g. charset 'utf8mb5', collation 'utf8mb4_unicode_ci'.
*/
private function findDuplicateFilesForIdentifier(string $identifier, int $storage): array
{
$fileQueryBuilder = $this->connectionPool->getQueryBuilderForTable('sys_file');

return $fileQueryBuilder->select('uid', 'identifier')
$fileQueryBuilder->select('uid', 'identifier')
->from('sys_file')
->where(
$fileQueryBuilder->expr()->eq(
'identifier',
$fileQueryBuilder->createNamedParameter($identifier, \PDO::PARAM_STR)
),
$fileQueryBuilder->expr()->eq(
'storage',
$fileQueryBuilder->createNamedParameter($storage, Connection::PARAM_INT)
)
)->orderBy('uid', 'DESC')->executeQuery()
)->orderBy('uid', 'DESC');

$whereClause = $fileQueryBuilder->expr()->eq('identifier',
$fileQueryBuilder->createNamedParameter($identifier, \PDO::PARAM_STR));
$whereClause = 'BINARY ' . $whereClause;
$fileQueryBuilder->add('where', $whereClause);

return $fileQueryBuilder->executeQuery()
->fetchAllAssociative();
}

Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,23 @@ Finds and fixes duplicates of sys_file entries pointing to the same file. Merges
Tested successfully with TYPO3 v12.

## Warning

Older versions for TYPO3 v8 may not consider identifiers with mixed case or sys_file
entries on several storages (sys_file.storage) correctly, see issue https://github.com/ElementareTeilchen/unduplicator/issues/2

## Portabilty (database)

In order to test for duplicates, a database command like this is used:

```sql
SELECT COUNT(*), MAX(identifier) AS identifier, storage FROM `sys_file` GROUP BY BINARY identifier, storage HAVING COUNT(*) > 1;
```

Therefore, it is necessary, that the underlying database engines support MAX and BINARY. This command was tested with the following:

* MariaDB
* MySQL

## Usage
We strongly recommend to run the **reference index update** (before and after):
If not run before or the references are out of date, some references may be overlooked and a sys_file entry deleted which has references.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"sys_file",,,
,"uid","identifier","storage"
,1,"/test/ABC.jpg",1
,2,"/test/ABC.jpg",1
,3,"/test/abc.jpg",1
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"sys_file",,,
,"uid","identifier","storage"
,2,"/test/ABC.jpg",1
,3,"/test/abc.jpg",1
16 changes: 16 additions & 0 deletions Tests/Functional/Command/UnduplicateCommandTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,22 @@ class UnduplicateCommandTest extends FunctionalTestCase
self::assertEquals(0, $result['status']);
}

/**
* * abc.jpg
* * ABC.jpg
* * ABC.jpg
* should remove one ABC.jpg
*/
#[Test] public function unduplicateCommandFixesDuplicatesWithMixCaseSensitives(): void
{
$this->importCSVDataSet(__DIR__ . '/DataSet/sys_file_duplicates_mix_casesensitive.csv');

$result = $this->executeConsoleCommand(self::BASE_COMMAND);

$this->assertCSVDataSet(__DIR__ . '/DataSet/sys_file_duplicates_mix_casesensitive_RESULT.csv');
self::assertEquals(0, $result['status']);
}

#[Test] public function unduplicateCommandFixesDuplicatesWithReferences(): void
{
$this->importCSVDataSet(__DIR__ . '/DataSet/sys_file_duplicates_with_references.csv');
Expand Down

0 comments on commit 37ab27e

Please sign in to comment.