Skip to content

Commit

Permalink
utils
Browse files Browse the repository at this point in the history
  • Loading branch information
e3rd committed Mar 12, 2024
1 parent 3417258 commit 42f1a50
Show file tree
Hide file tree
Showing 5 changed files with 257 additions and 221 deletions.
51 changes: 49 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,14 +104,18 @@ mv -n /home/user/duplicates/bar.txt /home/user/duplicates/✓bar.txt
mv -n /home/user/duplicates/third.txt /home/user/duplicates/✓third.txt
```

# Documentation – `Deduplidog` class
# Documentation

## Parameters

Import the `Deduplidog` class and change its parameters.

```python3
from deduplidog import Deduplidog
```

Or change these parameters from the CLI or TUI by launching `deduplidog`.

Find the duplicates. Normally, the file must have the same size, date and name. (Name might be just similar if parameters like strip_end_counter are set.) If media_magic=True, media files receive different rules: Neither the size nor the date are compared. See its help.

| parameter | type | default | description |
Expand Down Expand Up @@ -141,4 +145,47 @@ Find the duplicates. Normally, the file must have the same size, date and name.
| media_magic | bool | False | Neither the size nor the date is compared for files with media suffixes.<br>A video is considered a duplicate if it has the same name and a similar number of frames, even if it has a different extension.<br>An image is considered a duplicate if it has the same name and a similar image hash, even if the files are of different sizes.<br>(This mode is considerably slower.) |
| accepted_frame_delta | int | 1 | Used only when media_magic is True |
| accepted_img_hash_diff | int | 1 | Used only when media_magic is True |
| img_compare_date | bool | False | If True and media_magic=True, the file date or the EXIF date must match. |
| img_compare_date | bool | False | If True and media_magic=True, the file date or the EXIF date must match. |

## Utils
In the `deduplidog.utils` package, you'll find several handy tools to help you. You can discover their parameters through your IDE hints.

### `images`
*`urls: Iterable[str | Path]`* Display a ribbon of images.

### `print_video_thumbs`
*`src: str | Path`* Displays thumbnails for a video.

### `print_videos_thumbs`
*`dir_: Path`* To quickly understand the content of each video, output the duration and the first few frames.

### `get_frame_count`
*`filename: str|Path`* Uses cv2 to determine the video frame count. Method is cached.

### `search_for_media_wizzard`
*`cwd: str`* Repeatedly prompt and search for files with similar names somewhere in the specified path. Display all such files as images and video previews.

### `are_contained`
*`work_dir: str, original_dir: str, sec_range: int = 60`* You have two dirs whose files use different naming systems (427.JPG vs DSC_1344)
and which you suspect contain the same set. The same files in both dirs seem to have the same timestamp.
The same timestamp means +/- sec_range (ex: 1 minute).
Loops over all files in work_dir and displays the corresponding files having the same timestamp,
or warns that no original exists.

### `remove_prefix_in_workdir`
*`work_dir: str`* Removes the prefix ✓ recursively from all the files. The prefix might have been previously given by the deduplidog.


### `mark_symlink_by_target`
*`suspicious_directory: str | Path, starting_path: str`* If the file is a symlink, pointing to this path, rename it with an arrow.

```
:param suspicious_directory: Ex: /media/user/disk/Takeout/Photos/
:param starting_path: Ex: /media/user/disk
```

### `mark_symlink_only_dirs`
*`dir_: str | Path`* If the directory is full of only symlinks or empty, rename it to an arrow.

### `mtime_files_in_dir_according_to_json`
*`dir_: str | Path, json_dir: str | Path`* Google Photos returns JSON with the photo modification time. Sets the photos from the dir_ to the dates fetched from the directory with these JSONs.
11 changes: 9 additions & 2 deletions deduplidog/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,15 @@ def cli(dd: Deduplidog):
return dd


if __name__ == "__main__":
def main():
global INPUTS

# CLI
try:
dd = cli()
if input("Continue? [Y/n] ").casefold() not in ("", "y"):
if not dd: # maybe just --help
return
if input("See more options? [Y/n] ").casefold() not in ("", "y"):
sys.exit()
except click.MissingParameter:
# User launched the program without parameters.
Expand Down Expand Up @@ -110,3 +114,6 @@ def cli(dd: Deduplidog):
continue
if input("See more options? [Y/n] ").casefold() not in ("y", ""):
break

if __name__ == "__main__":
main()
220 changes: 4 additions & 216 deletions deduplidog/deduplidog.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import logging
import os
import re
Expand All @@ -7,24 +6,19 @@
from dataclasses import dataclass
from datetime import datetime
from functools import cache
from itertools import chain
from pathlib import Path
from time import sleep
from typing import Annotated, get_args, get_type_hints
from zlib import crc32

import click
import cv2
import imagehash
from dataclass_click import option
from humanize import naturaldelta, naturalsize
from IPython.display import Image, clear_output, display
from ipywidgets import HBox, widgets
from PIL import ExifTags, Image
from sh import find
from tqdm.autonotebook import tqdm

from .interface_utils import Field
from .utils import _qp, crc, get_frame_count

VIDEO_SUFFIXES = ".mp4", ".mov", ".avi", ".vob", ".mts", ".3gp", ".mpg", ".mpeg", ".wmv"
IMAGE_SUFFIXES = ".jpg", ".jpeg", ".png", ".gif"
Expand Down Expand Up @@ -528,9 +522,9 @@ def _find_similar(self, work_file: Path, candidates: list[Path]):
for original in candidates:
ost, wst = original.stat(), work_file.stat()
if (self.ignore_date
or wst.st_mtime == ost.st_mtime
or self.tolerate_hour and self.tolerate_hour[0] <= (wst.st_mtime - ost.st_mtime)/3600 <= self.tolerate_hour[1]
) and (self.ignore_size or wst.st_size == ost.st_size and (not self.checksum or crc(original) == crc(work_file))):
or wst.st_mtime == ost.st_mtime
or self.tolerate_hour and self.tolerate_hour[0] <= (wst.st_mtime - ost.st_mtime)/3600 <= self.tolerate_hour[1]
) and (self.ignore_size or wst.st_size == ost.st_size and (not self.checksum or crc(original) == crc(work_file))):
return original

def _find_similar_media(self, work_file: Path, comparing_image: bool, candidates: list[Path]):
Expand Down Expand Up @@ -609,209 +603,3 @@ def _print_change(self, change: Change):
for text, changes in zip((f" {wicon}{self.work_dir_name}:",
f" {oicon}{self.original_dir_name}:"), change.values()) if len(changes)]


@cache
def crc(path: Path) -> int:
    """Return the CRC-32 checksum of the file at *path*.

    The file is streamed in 4 KiB chunks, so memory use does not grow with the file size.
    Results are memoised by ``functools.cache`` keyed on *path*: a file that changes
    on disk after the first call keeps returning the first checksum.

    Surprisingly, sha256 and sha1 were faster than md5 when using hashlib.file_digest.
    However, crc32 is still the fastest.
    """
    checksum = 0
    with path.open("rb") as f:
        # An empty read marks the end of the file.
        while chunk := f.read(4096):
            checksum = crc32(chunk, checksum)
    return checksum


def _qp(path: Path) -> str:
    """Quoted path: render *path* so that it can be pasted into a bash command line.

    A path without whitespace or shell-special characters is returned unchanged.
    Anything else is wrapped in double quotes, and the characters that stay special
    inside double quotes (double quote, backslash, dollar sign and backtick)
    are backslash-escaped.

    Double quotes are kept on purpose (instead of switching to ``shlex.quote``)
    so that the output for plain paths and for paths with spaces stays as it was.
    """
    s = str(path)
    # Characters bash treats specially outside of quotes; whitespace is tested separately.
    if not any(ch.isspace() or ch in "\"'\\$`*?[]{}()<>|&;#" for ch in s):
        return s
    # Inside double quotes only these four keep a special meaning, so they get a backslash.
    escaped = "".join("\\" + ch if ch in '"\\$`' else ch for ch in s)
    return f'"{escaped}"'


# TODO: below are some functions that should be converted into documented utils or removed

def remove_prefix_in_workdir(work_dir: str):
    """Remove the prefix ✓ recursively from all the files under *work_dir*.

    The prefix might have been previously given by the deduplidog.

    Files whose stem does not start with ✓ are left untouched. A prefixed file is
    also left untouched (and reported) when a file with the un-prefixed name already
    exists, because ``Path.rename`` would silently replace that file on POSIX systems.

    :param work_dir: Directory that is searched recursively.
    """
    # Materialise the listing first so that renaming does not disturb the directory walk.
    work_files = [f for f in tqdm(Path(work_dir).rglob("*"), desc="Caching working files") if f.is_file()]
    for p in work_files:
        if not p.stem.startswith("✓"):
            continue
        target = p.with_stem(p.stem.removeprefix("✓"))
        if target.exists():
            print("Skipping, target already exists:", target)
            continue
        p.rename(target)


@cache
def get_frame_count(filename: str | Path):
    """Return the frame count that OpenCV reports for the video *filename*.

    :param filename: Path to the video; converted with ``str()`` before it is handed to cv2.
    :return: The value of ``cv2.CAP_PROP_FRAME_COUNT`` as returned by ``VideoCapture.get``.

    The result is memoised by ``functools.cache``, keyed on *filename*.
    """
    video = cv2.VideoCapture(str(filename))
    try:
        return video.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Close the capture explicitly instead of leaving it to garbage collection.
        video.release()


def mark_symlink_by_target(suspicious_directory: str | Path, starting_path):
""" If the file is a symlink, pointing to this path, rename it with an arrow
:param suspicious_directory: Ex: /media/user/disk/Takeout/Photos/
:param starting_path: Ex: /media/user/disk
"""
for f in (x for x in Path(suspicious_directory).rglob("*") if x.is_symlink()):
if str(f.resolve()).startswith(starting_path):
print(f.rename(f.with_name("→" + f.name)))
print(f)

# Repeatedly search for files with a similar name located somewhere under the given path.
# Display all such files as images and video thumbnails.


def _stub(cwd: str = "/media/user/disk1/Photos/"):
    """Interactive loop: read a name fragment and preview every matching file under *cwd*.

    Each round reads a line from stdin, clears the notebook output, runs
    ``find -iname "*<fragment>*"`` in *cwd*, prints the number of hits and hands
    them to ``images`` and ``print_video_thumbs``.

    The loop has no exit condition of its own; it only ends when an exception
    (for example from ``input()``) propagates.

    :param cwd: Directory in which ``find`` is executed.
    """
    while True:
        a = input()
        clear_output()
        print("Searching", a, "in", cwd)
        files = find("-iname", f"*{a}*", _cwd=cwd)
        # `find` prints paths relative to cwd, one per line.
        files = [Path(cwd, f.strip()) for f in files]
        print("Len", len(files))
        images(files)
        for f in files:
            print_video_thumbs(f)


def _are_similar(original: Path, work_file: Path, accepted_img_hash_diff: int = 1):
    """Tell whether two image files have (nearly) the same average hash.

    :param original: First image file.
    :param work_file: Second image file.
    :param accepted_img_hash_diff: Maximum difference between the two hashes that is still accepted.
    :return: True if the difference of the hashes is at most `accepted_img_hash_diff`.
    """
    # Context managers close both image files as soon as the hashes are computed.
    with Image.open(original) as original_pil, Image.open(work_file) as work_pil:
        hash0 = imagehash.average_hash(original_pil)
        hash1 = imagehash.average_hash(work_pil)
    # maximum bits that could be different between the hashes
    return abs(hash0 - hash1) <= accepted_img_hash_diff


def are_contained(work_dir, original_dir, sec_range: int = 60):
    """ You got two dirs with files having different naming system (427.JPG vs DSC_1344)
    which you suspect to contain the same set. The same files in the dirs seem to have the same timestamp.
    The same timestamp means +/- sec_range (ex: 1 minute).
    Loop all files from work_dir and display corresponding files having the same timestamp,
    or warn that no original exists.

    Modification times are compared in whole seconds, so files whose timestamps differ
    only in the sub-second part are still treated as candidates. Only regular files are
    considered. The results are stored in the module-level globals `originals`
    (timestamp -> set of original files) and `found` (work file -> matching original).

    :param work_dir: Directory whose files are looked up.
    :param original_dir: Directory searched for the corresponding originals.
    :param sec_range: Accepted timestamp difference in seconds, in both directions.
    """

    # build directory of originals
    global originals, found
    originals = defaultdict(set)  # [timestamp in whole seconds] = set(originals...)
    for of in Path(original_dir).rglob("*"):
        if of.is_file():
            originals[int(of.stat().st_mtime)].add(of)

    # 0, -1, 1, -2, 2 ... to find candidate earlier (the same for every work file)
    range_ = sorted(range(-sec_range, sec_range + 1), key=abs)

    found = {}
    work_files = [wf for wf in Path(work_dir).rglob("*") if wf.is_file()]
    for wf in (bar := tqdm(work_files)):
        bar.set_postfix({"file": str(wf.name), "found": len(found)})

        timestamp = int(wf.stat().st_mtime)
        # find all originals with similar timestamps
        corresponding = (originals.get(timestamp + i, set()) for i in range_)
        # flatten the sets and unique them (but keep as list to preserve files with less timestamp difference first)
        corresponding = list(dict.fromkeys(chain.from_iterable(corresponding)))

        if corresponding:
            for candidate in (bar2 := tqdm(corresponding, leave=False, desc="Candidates")):
                bar2.set_postfix({"file": candidate.name})
                if _are_similar(candidate, wf):
                    found[wf] = candidate
                    # tqdm would not disappear if not finished https://github.com/tqdm/tqdm/issues/1382
                    bar2.update(float("inf"))
                    bar2.close()
                    break
            else:
                # No candidate was similar enough: show the work file next to all candidates.
                print("No candidate for", wf.name, corresponding)
                images([wf] + list(corresponding))
        else:
            print("Missing originals for", wf.name)


# are_contained("/media/user/disk1/Photos/_tabor/2/", "/media/user/disk1/Photos/tabory/C 074 2016/")


def images(urls):
    """Display a ribbon of images.

    Each item of `urls` is turned into a ``Path``. Existing files are loaded into
    150px-wide image widgets that are shown side by side; for a missing file,
    "Fail" and its path are printed instead.
    """
    ribbon = []
    for url in tqdm(urls, leave=False):
        p = Path(url)
        if not p.exists():
            print("Fail", p)
            continue
        ribbon.append(widgets.Image(width=150, value=p.read_bytes()))
    display(HBox(ribbon))


def print_video_thumbs(src):
    """Display thumbnails for a video.

    Frames are read one after another; on every 100th pass of the loop the frame
    just read is JPEG-encoded into a 150px-wide image widget. Reading stops at the
    end of the video, when a frame cannot be turned into a thumbnail, or once the
    counter exceeds 500. Afterwards `src` and its frame count are printed and the
    thumbnails (if any) are displayed side by side.

    :param src: Path to the video; converted with ``str()`` before it is handed to cv2.
    """
    vidcap = cv2.VideoCapture(str(src))
    thumbs = []
    try:
        # The first frame only serves as a check that the video can be read at all.
        success, image = vidcap.read()
        count = 0
        while success:
            success, image = vidcap.read()
            if count % 100 == 0:
                try:
                    thumbs.append(widgets.Image(width=150, value=cv2.imencode('.jpg', image)[1]))
                except Exception:
                    # Best effort: a frame that cannot be encoded ends the preview,
                    # while KeyboardInterrupt and SystemExit still propagate.
                    break
            if count > 500:
                break
            count += 1
    finally:
        # Close the capture explicitly instead of leaving it to garbage collection.
        vidcap.release()
    print(src, get_frame_count(src))
    if thumbs:
        display(HBox(thumbs))


def get_video_thumbs(dir_, *, suffixes=(".mov", ".avi", ".mp4", ".vob")):
    """To quickly understand the content of each video, output the duration and the first few frames.

    Walks `dir_` recursively in sorted order and calls ``print_video_thumbs``
    for every path whose lower-cased suffix is listed in `suffixes`.

    :param dir_: Directory that is searched recursively.
    :param suffixes: Lower-case file suffixes (including the dot) that count as videos.
    """
    for f in sorted(Path(dir_).rglob("*")):
        if f.suffix.lower() in suffixes:
            print_video_thumbs(f)


# get_video_thumbs("/media/user/disk1/Photos/dram/")


def mark_symlink_only_dirs(dir_):
    """If a directory is empty or full of only symlinks, rename it with an arrow.

    All directories found recursively under `dir_` are checked. A directory whose
    entries are all symlinks (an empty directory qualifies, too) gets the prefix →
    in its name, and the new path is printed.

    :param dir_: Directory that is searched recursively; it is never renamed itself.
    """
    # Take a snapshot first, so that renaming does not interfere with the directory walk.
    dirs = [x for x in Path(dir_).rglob("*") if x.is_dir()]
    # Deepest first: renaming a directory must not invalidate paths still waiting to be checked.
    dirs.sort(key=lambda p: len(p.parts), reverse=True)
    for d in dirs:
        if all(x.is_symlink() for x in d.glob("*")):
            print(d.rename(d.with_name("→" + d.name)))


# mark_symlink_only_dirs("/media/user/disk2/Takeoutuser/Google Photos/")


def mark_01_copies(suspicious_directory, duplicates_dir="/media/user/disk2/_duplikaty_smazat/"):
    """Mark "(1)" copies whose counterpart exists in the directory of duplicates.

    Every entry directly in `suspicious_directory` whose name contains "(1)" is
    examined. When its stem, with a trailing "(1)" removed, equals the stem of any
    entry found recursively under `duplicates_dir` (ignoring a leading ✓ there),
    the entry gets the prefix → in its name and the new path is printed.

    :param suspicious_directory: Directory with the suspected copies (not searched recursively).
    :param duplicates_dir: Directory searched recursively for the counterparts.
    """
    candidates = list(Path(suspicious_directory).glob("*(1)*"))
    if not candidates:
        return
    # Scan the duplicates once instead of once per candidate.
    known_stems = {x.stem.removeprefix("✓") for x in Path(duplicates_dir).rglob("*")}
    for f in candidates:
        # Rename at most once, however many counterparts share the stem.
        if f.stem.removesuffix("(1)") in known_stems:
            print(f.rename(f.with_name("→" + f.name)))
# mark_01_copies("/media/user/disk2/Takeoutuser/YouTube and YouTube Music/videos/")


def mtime_files_in_dir_according_to_json(dir_, json_dir):
    """Set file times in `dir_` from the JSON metadata that Google Photos returns.

    For every path found recursively under `dir_`, a metadata file named after the
    first 46 characters of the file name plus ".json" is looked up directly in
    `json_dir`. If it exists, its ``photoTakenTime.timestamp`` value is applied as
    both the access and the modification time, and the path is printed.
    Paths without a metadata file are left untouched.

    :param dir_: Directory with the photos, searched recursively.
    :param json_dir: Directory holding the JSON metadata files.
    """
    json_dir = Path(json_dir)
    for photo in Path(dir_).rglob("*"):
        # NOTE: earlier experiments had (disabled) filters here: skipping .jpg/.jpeg files,
        # and changing only the files that had not already been changed otherwise
        # (recognised by their date, 10 June 2022).
        # presumably the export truncates the metadata name to 46 characters – TODO confirm
        metadata = json_dir.joinpath(photo.name[:46] + ".json")
        if metadata.exists():
            # Read explicitly as UTF-8 so the result does not depend on the locale encoding.
            timestamp = json.loads(metadata.read_text(encoding="utf-8"))["photoTakenTime"]["timestamp"]
            os.utime(photo, (int(timestamp), int(timestamp)))
            print(photo)

# mtime_files_in_dir_according_to_json("/media/user/disk2/Takeoutuser/Google Photos/Photos from 2019/",
# "/media/user/disk2/photos_json/")
Loading

0 comments on commit 42f1a50

Please sign in to comment.