Skip to content

Commit

Permalink
Tweaks to fingerprinting and logging
Browse files Browse the repository at this point in the history
  • Loading branch information
BlythMeister committed Jul 23, 2020
1 parent 91bddef commit fe0957e
Show file tree
Hide file tree
Showing 7 changed files with 277 additions and 234 deletions.
144 changes: 76 additions & 68 deletions src/BingImageDownload/BingInteractionAndParsing.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
using SixLabors.ImageSharp;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Threading;
using System.Xml.Linq;

namespace BingImageDownload
Expand All @@ -15,108 +15,107 @@ internal class BingInteractionAndParsing
private const string Url = "https://bing.com";

private readonly ConsoleWriter consoleWriter;
private readonly ImageHashing imageHashing;
private readonly ImageFingerprinting imageFingerprinting;
private readonly ImagePropertyHandling imagePropertyHandling;
private readonly Paths paths;
private readonly Serializer serializer;
private readonly List<string> urlsRetrieved;
private readonly List<CultureInfo> countries;
private readonly string urlsRetrievedBinFile;

public BingInteractionAndParsing(ConsoleWriter consoleWriter, ImageHashing imageHashing, ImagePropertyHandling imagePropertyHandling, Paths paths, Serializer serializer)
public BingInteractionAndParsing(ConsoleWriter consoleWriter, ImageFingerprinting imageFingerprinting, ImagePropertyHandling imagePropertyHandling, Paths paths, Serializer serializer)
{
this.consoleWriter = consoleWriter;
this.imageHashing = imageHashing;
this.imageFingerprinting = imageFingerprinting;
this.imagePropertyHandling = imagePropertyHandling;
this.paths = paths;
this.serializer = serializer;
urlsRetrievedBinFile = Path.Combine(paths.AppData, "urlsRetrieved.bin");

urlsRetrieved = serializer.Deserialize<List<string>>(urlsRetrievedBinFile).ToList();
countries = CultureInfo.GetCultures(CultureTypes.AllCultures).Where(x => x.Name.Contains("-")).ToList();

consoleWriter.WriteLine($"Have loaded {urlsRetrieved.Count} previous URLs");
consoleWriter.WriteLine($"Have loaded {countries.Count} countries");
}

internal void GetBingImages(CancellationToken cancellationToken)
internal (int countryDownloadedImages, int countryDuplicateImages, int countrySeenUrls) GetBingImages(CultureInfo country)
{
var downloadedImages = 0;

foreach (var country in countries)
consoleWriter.WriteLine($"Searching for images for {country.Name} - {country.DisplayName}");
var stopwatch = new Stopwatch();
stopwatch.Start();

var countryDownloadedImages = 0;
var countryDuplicateImages = 0;
var countrySeenUrls = 0;
var moreImages = true;
var datePairs = new Dictionary<(string start, string end), XElement>();
while (moreImages)
{
if (cancellationToken.IsCancellationRequested)
var imageNodes = GetImages(datePairs.Count, country.Name);

if (!imageNodes.Any())
{
return;
moreImages = false;
}

consoleWriter.WriteLine($"Searching for images for {country.Name} - {country.DisplayName}");
var countryImages = 0;
var countryDuplicateImages = 0;
var moreImages = true;
var datePairs = new Dictionary<(string start, string end), XElement>();
while (moreImages)
else
{
var imageNodes = GetImages(datePairs.Count, country.Name);

if (!imageNodes.Any())
foreach (var imageNode in imageNodes)
{
moreImages = false;
}
else
{
foreach (var imageNode in imageNodes)
{
var startDate = imageNode.Element("startdate")?.Value;
var endDate = imageNode.Element("enddate")?.Value;

if (datePairs.Any(x => x.Key.start == startDate && x.Key.end == endDate))
{
moreImages = false;
continue;
}
var startDate = imageNode.Element("startdate")?.Value;
var endDate = imageNode.Element("enddate")?.Value;

datePairs.Add((startDate, endDate), imageNode);
if (datePairs.Any(x => x.Key.start == startDate && x.Key.end == endDate))
{
moreImages = false;
continue;
}

datePairs.Add((startDate, endDate), imageNode);
}
}
}

foreach (var ((startDate, endDate), imageNode) in datePairs.OrderBy(x => x.Key.start))
foreach (var ((startDate, endDate), imageNode) in datePairs.OrderBy(x => x.Key.start))
{
var imageUrl = $"{Url}{imageNode.Element("urlBase")?.Value}_1920x1080.jpg";
consoleWriter.WriteLine(1, $"Image for: '{country.Name}' on {startDate}-{endDate} was: {imageUrl}");
try
{
var imageUrl = $"{Url}{imageNode.Element("urlBase")?.Value}_1920x1080.jpg";
consoleWriter.WriteLine(1, $"Image for: '{country.Name}' on {startDate}-{endDate} was: {imageUrl}");
try
var result = DownloadAndSaveImage(imageNode, imageUrl);
switch (result)
{
if (DownloadAndSaveImage(imageNode, imageUrl))
{
countryImages++;
}
else
{
case DownloadResult.SeenUrl:
countrySeenUrls++;
break;

case DownloadResult.DuplicateImage:
countryDuplicateImages++;
}
}
catch (Exception ex)
{
consoleWriter.WriteLine("There was an error getting image", ex);
break;

case DownloadResult.NewImage:
countryDownloadedImages++;
break;
}
}

downloadedImages += countryImages;
consoleWriter.WriteLine($"Found {countryImages} new images for {country.Name}");
consoleWriter.WriteLine($"Found {countryDuplicateImages} duplicate images for {country.Name}");
consoleWriter.WriteLine("");
catch (Exception ex)
{
consoleWriter.WriteLine("There was an error getting image", ex);
}
}

consoleWriter.WriteLine($"Found {downloadedImages} new images");
consoleWriter.WriteLine($"Found {countryDownloadedImages} new images for {country.Name}");
consoleWriter.WriteLine($"Found {countryDuplicateImages} duplicate images for {country.Name}");
consoleWriter.WriteLine($"Found {countrySeenUrls} urls already downloaded for {country.Name}");
consoleWriter.WriteLine($"Duration {stopwatch.Elapsed.TotalSeconds} seconds for {country.Name}");
consoleWriter.WriteLine("");

return (countryDownloadedImages, countryDuplicateImages, countrySeenUrls);
}

internal bool DownloadAndSaveImage(XElement imageNode, string imageUrl)
private DownloadResult DownloadAndSaveImage(XElement imageNode, string imageUrl)
{
if (urlsRetrieved.Contains(imageUrl))
{
consoleWriter.WriteLine(2, "Already Downloaded Image URL");
return false;
return DownloadResult.SeenUrl;
}

var tempFilename = Path.Combine(paths.DownloadPath, Guid.NewGuid() + ".jpg");
Expand All @@ -129,18 +128,18 @@ internal bool DownloadAndSaveImage(XElement imageNode, string imageUrl)
catch (Exception e)
{
consoleWriter.WriteLine(2, $"Error downloading image from url: {imageUrl}", e);
return false;
return DownloadResult.Error;
}

consoleWriter.WriteLine(2, "Downloaded Image, Checking If Duplicate");
var newImage = false;
var haveIdenticalImage = imageHashing.HaveIdenticalImageInHashTable(tempFilename);
var haveIdenticalImage = imageFingerprinting.HaveIdenticalImageInFingerprints(tempFilename);

if (!haveIdenticalImage)
{
var filePath = Path.Combine(paths.SavePath, GetFileName(imageUrl));
var counter = 0;
while (imageHashing.HaveFileNameInHashTable(filePath))
while (imageFingerprinting.HaveFileNameInFingerprints(filePath))
{
counter++;
filePath = Path.Combine(paths.SavePath, GetFileName(imageUrl, counter));
Expand All @@ -153,20 +152,21 @@ internal bool DownloadAndSaveImage(XElement imageNode, string imageUrl)
imagePropertyHandling.SetImageExifTags(imageNode, srcImg);
srcImg.Save(filePath);
}
imageHashing.AddHash(filePath);
imageFingerprinting.AddFingerprint(filePath);
}
else
{
consoleWriter.WriteLine(3, "Identical Image Downloaded");
}

urlsRetrieved.Add(imageUrl);

SaveUrlBin();
File.Delete(tempFilename);
return newImage;
return newImage ? DownloadResult.NewImage : DownloadResult.DuplicateImage;
}

internal string GetFileName(string imageUrl, int counter = 0)
private string GetFileName(string imageUrl, int counter = 0)
{
var name = imageUrl.Substring(7 + Url.Length);
if (name.Contains("_"))
Expand All @@ -184,7 +184,7 @@ internal string GetFileName(string imageUrl, int counter = 0)
return Path.GetInvalidFileNameChars().Aggregate(name, (current, invalidChar) => current.Replace(invalidChar, '-'));
}

internal List<XElement> GetImages(int currentIndex, string country)
private List<XElement> GetImages(int currentIndex, string country)
{
var urlToLoad = $"{Url}/HPImageArchive.aspx?format=xml&idx={currentIndex}&n=5&mkt={country}";

Expand Down Expand Up @@ -220,5 +220,13 @@ private void SaveUrlBin()
{
serializer.Serialize(urlsRetrieved, urlsRetrievedBinFile);
}

/// <summary>
/// Outcome of a single DownloadAndSaveImage call; GetBingImages switches on it to
/// update its per-country counters.
/// </summary>
private enum DownloadResult
{
// Returned when downloading the image from its URL threw an exception.
// NOTE(review): the switch in GetBingImages has no case for this value, so errors are not tallied - confirm that is intended.
Error,
// Returned when the image URL is already present in the urlsRetrieved list, before any download is attempted.
SeenUrl,
// Returned when the download completed but the image was not recorded as new.
DuplicateImage,
// Returned when the download completed and the image was recorded as new.
NewImage
}
}
}
16 changes: 14 additions & 2 deletions src/BingImageDownload/ConsoleWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,15 @@ internal class ConsoleWriter
internal ConsoleWriter(Paths paths)
{
Console.OutputEncoding = Encoding.UTF8;
logWriter = new StreamWriter(Path.Combine(paths.LogPath, $"Log-{DateTime.UtcNow:yyyy-MM-dd}.txt"), false, Encoding.UTF8);
var logPath = Path.Combine(paths.LogPath, $"Log {DateTime.UtcNow:yyyy-MM-dd}.txt");
var counter = 0;
while (File.Exists(logPath))
{
counter++;
logPath = Path.Combine(paths.LogPath, $"Log {DateTime.UtcNow:yyyy-MM-dd} ({counter}).txt");
}

logWriter = new StreamWriter(logPath, false, Encoding.UTF8);

if (tempBuilder.Length > 0)
{
Expand Down Expand Up @@ -66,7 +74,11 @@ internal void WriteToFile(string textLine)

if (tempBuilder.Length > 0)
{
logWriter.Write(tempBuilder.ToString());
lock (lockThis)
{
logWriter.Write(tempBuilder.ToString());
}

tempBuilder.Clear();
}

Expand Down
Loading

0 comments on commit fe0957e

Please sign in to comment.