Skip to content

Commit

Permalink
Fix image difference matching algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
BlythMeister committed Jul 22, 2020
1 parent c334d9c commit 71fcc97
Show file tree
Hide file tree
Showing 7 changed files with 125 additions and 110 deletions.
123 changes: 57 additions & 66 deletions src/BingImageDownload/BingInteractionAndParsing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
using System.Linq;
using System.Net;
using System.Threading;
using System.Xml;
using System.Xml.Linq;

namespace BingImageDownload
{
Expand Down Expand Up @@ -53,53 +53,52 @@ internal void GetBingImages(CancellationToken cancellationToken)
consoleWriter.WriteLine($"Searching for images for {country.Name} - {country.DisplayName}");
var countryImages = 0;
var countryDuplicateImages = 0;
var currentIndex = 0;
var moreImages = true;
var startDate = string.Empty;
var endDate = string.Empty;
var datePairs = new Dictionary<(string start, string end), XElement>();
while (moreImages)
{
var xmlNodeList = GetImages(currentIndex, country.Name);
var imageNodes = GetImages(datePairs.Count, country.Name);

if (xmlNodeList == null)
if (!imageNodes.Any())
{
moreImages = false;
}
else
{
foreach (XmlNode xmlNode in xmlNodeList)
foreach (var imageNode in imageNodes)
{
var nodeStartDate = xmlNode.SelectSingleNode("startdate")?.InnerText;
var nodeEndDate = xmlNode.SelectSingleNode("enddate")?.InnerText;
var startDate = imageNode.Element("startdate")?.Value;
var endDate = imageNode.Element("enddate")?.Value;

if (startDate == nodeStartDate && endDate == nodeEndDate)
if (datePairs.Any(x => x.Key.start == startDate && x.Key.end == endDate))
{
moreImages = false;
break;
continue;
}

startDate = nodeStartDate;
endDate = nodeEndDate;
var imageUrl = $"{Url}{xmlNode.SelectSingleNode("urlBase")?.InnerText}_1920x1080.jpg";
consoleWriter.WriteLine(1, $"Image for: '{country.Name}' on {startDate}-{endDate} index {currentIndex} was: {imageUrl}");
try
{
if (DownloadAndSaveImage(xmlNode))
{
countryImages++;
}
else
{
countryDuplicateImages++;
}
}
catch (Exception ex)
{
consoleWriter.WriteLine("There was an error getting image", ex);
}
datePairs.Add((startDate, endDate), imageNode);
}
}
}

currentIndex += 1;
foreach (var ((startDate, endDate), imageNode) in datePairs.OrderBy(x => x.Key.start))
{
var imageUrl = $"{Url}{imageNode.Element("urlBase")?.Value}_1920x1080.jpg";
consoleWriter.WriteLine(1, $"Image for: '{country.Name}' on {startDate}-{endDate} was: {imageUrl}");
try
{
if (DownloadAndSaveImage(imageNode, imageUrl))
{
countryImages++;
}
else
{
countryDuplicateImages++;
}
}
catch (Exception ex)
{
consoleWriter.WriteLine("There was an error getting image", ex);
}
}

Expand All @@ -112,28 +111,25 @@ internal void GetBingImages(CancellationToken cancellationToken)
consoleWriter.WriteLine($"Found {downloadedImages} new images");
}

internal bool DownloadAndSaveImage(XmlNode xmlNode)
internal bool DownloadAndSaveImage(XElement imageNode, string imageUrl)
{
var fileUrl = $"{Url}{xmlNode.SelectSingleNode("urlBase")?.InnerText}_1920x1080.jpg";
if (urlsRetrieved.Contains(fileUrl))
if (urlsRetrieved.Contains(imageUrl))
{
consoleWriter.WriteLine(2, "Already Downloaded Image URL");
return false;
}

var filePath = Path.Combine(paths.SavePath, GetFileName(xmlNode));
var filePath = Path.Combine(paths.SavePath, GetFileName(imageUrl));
var tempFilename = Path.Combine(paths.DownloadPath, Guid.NewGuid() + ".jpg");

try
{
using (var client = new WebClient())
{
client.DownloadFile(fileUrl, tempFilename);
}
using var client = new WebClient();
client.DownloadFile(imageUrl, tempFilename);
}
catch (Exception e)
{
consoleWriter.WriteLine(2, $"Error downloading image from url: {fileUrl}", e);
consoleWriter.WriteLine(2, $"Error downloading image from url: {imageUrl}", e);
return false;
}

Expand All @@ -145,7 +141,7 @@ internal bool DownloadAndSaveImage(XmlNode xmlNode)
consoleWriter.WriteLine(3, "Found New Image");
using (var srcImg = Image.Load(tempFilename))
{
imagePropertyHandling.SetTitleOnImage(xmlNode, srcImg);
imagePropertyHandling.SetImageExifTags(imageNode, srcImg);
srcImg.Save(filePath);
}
imageHashing.AddHash(filePath);
Expand All @@ -155,17 +151,15 @@ internal bool DownloadAndSaveImage(XmlNode xmlNode)
consoleWriter.WriteLine(3, "Identical Image Downloaded");
}

urlsRetrieved.Add(fileUrl);
urlsRetrieved.Add(imageUrl);
SaveUrlBin();
File.Delete(tempFilename);
return newImage;
}

internal string GetFileName(XmlNode xmlNode)
internal string GetFileName(string imageUrl)
{
var nameNode = xmlNode.SelectSingleNode("urlBase");
if (nameNode == null) throw new Exception("Missing urlBase Node");

var name = nameNode.InnerText.Substring(7);
var name = imageUrl.Substring(7 + Url.Length);
if (name.Contains("_"))
{
name = name.Substring(0, name.IndexOf("_", StringComparison.Ordinal));
Expand All @@ -181,33 +175,30 @@ internal string GetFileName(XmlNode xmlNode)
return Path.GetInvalidFileNameChars().Aggregate(name, (current, invalidChar) => current.Replace(invalidChar, '-'));
}

internal XmlNodeList GetImages(int currentIndex, string country)
internal List<XElement> GetImages(int currentIndex, string country)
{
var urlToLoad = $"{Url}/HPImageArchive.aspx?format=xml&idx={currentIndex}&n=1&mkt={country}";
var urlToLoad = $"{Url}/HPImageArchive.aspx?format=xml&idx={currentIndex}&n=5&mkt={country}";

try
{
using (var client = new WebClient())
using var client = new WebClient();
var output = client.DownloadString(urlToLoad);
if (output.Length > 0 && output.Contains("<images>"))
{
var output = client.DownloadString(urlToLoad);
if (output.Length > 0 && output.Contains("<images>"))
try
{
try
{
var xmlDocument = new XmlDocument();
xmlDocument.LoadXml(output);
var xDocument = XDocument.Parse(output);

return xmlDocument.GetElementsByTagName("image");
}
catch (Exception e)
{
consoleWriter.WriteLine("Error getting images from XML response", e);
return null;
}
return xDocument.Descendants("image").ToList();
}
catch (Exception e)
{
consoleWriter.WriteLine("Error getting images from XML response", e);
return null;
}

return null;
}

return null;
}
catch (Exception e)
{
Expand All @@ -216,7 +207,7 @@ internal XmlNodeList GetImages(int currentIndex, string country)
}
}

/// <summary>
/// Persists the retrieved-URL collection by serializing
/// <c>urlsRetrieved</c> to <c>urlsRetrievedBinFile</c>.
/// </summary>
private void SaveUrlBin()
{
    serializer.Serialize(urlsRetrieved, urlsRetrievedBinFile);
}
Expand Down
29 changes: 21 additions & 8 deletions src/BingImageDownload/HistogramHash.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
Expand All @@ -7,32 +8,44 @@ namespace BingImageDownload
/// <summary>
/// Associates an image file name with a list of per-pixel RGB samples and
/// provides a tolerance-based comparison between two such sample lists.
/// </summary>
public class HistogramHash
{
    /// <summary>Largest per-channel difference that is still treated as "the same" pixel.</summary>
    private const int ChannelTolerance = 3;

    /// <summary>Two hashes match when the share of out-of-tolerance pixels is below this percentage.</summary>
    private const float MaxDifferencePercent = 1;

    /// <summary>File name (without directory) of the image this hash was built from.</summary>
    public string FileName { get; }

    /// <summary>Pixel samples (position plus R, G and B values) that make up the hash.</summary>
    public List<RgbPixelData> Rgb { get; }

    /// <summary>Creates a hash for <paramref name="fileName"/> from the given pixel samples.</summary>
    /// <param name="fileName">Name of the image file the samples belong to.</param>
    /// <param name="rgb">Pixel samples for that image.</param>
    public HistogramHash(string fileName, List<RgbPixelData> rgb)
    {
        FileName = fileName;
        Rgb = rgb;
    }

    /// <summary>
    /// Reports whether this entry should be discarded.
    /// </summary>
    /// <param name="paths">Provides the save and archive directories to look in.</param>
    /// <returns>
    /// <c>true</c> when the file name is blank, when the file exists in neither
    /// <c>paths.SavePath</c> nor <c>paths.ArchivePath</c>, or when there are no pixel samples.
    /// </returns>
    internal bool IsInvalid(Paths paths)
    {
        if (string.IsNullOrWhiteSpace(FileName))
        {
            return true;
        }

        var inSavePath = File.Exists(Path.Combine(paths.SavePath, FileName));
        var inArchivePath = File.Exists(Path.Combine(paths.ArchivePath, FileName));
        if (!inSavePath && !inArchivePath)
        {
            return true;
        }

        return Rgb == null || !Rgb.Any();
    }

    /// <summary>
    /// Compares this hash with <paramref name="other"/> pixel by pixel.
    /// </summary>
    /// <param name="other">Hash to compare against.</param>
    /// <returns>
    /// <c>false</c> when either side has no pixel list, when this hash is empty, or when
    /// <paramref name="other"/> lacks a pixel at a position present here; otherwise
    /// <c>true</c> when fewer than 1% of this hash's pixels differ by more than 3 in
    /// any of the R, G or B channels.
    /// </returns>
    internal bool Equal(HistogramHash other)
    {
        if (Rgb == null || other?.Rgb == null)
        {
            return false;
        }

        // With no pixels the percentage below would be 0/0 (NaN), which never compares
        // as "< 1"; return the same answer explicitly.
        if (Rgb.Count == 0)
        {
            return false;
        }

        // Index the other hash by position once instead of scanning its whole list
        // for every pixel. A lookup keeps source order inside each group, so the first
        // entry for a position is the same one a linear FirstOrDefault would find.
        var otherByPosition = other.Rgb.ToLookup(x => (x.X, x.Y));

        var differencesOverTolerance = 0f;

        foreach (var val in Rgb)
        {
            var otherVal = otherByPosition[(val.X, val.Y)].FirstOrDefault();
            if (otherVal == null)
            {
                return false;
            }

            var differenceR = Math.Abs(val.R - otherVal.R);
            var differenceG = Math.Abs(val.G - otherVal.G);
            var differenceB = Math.Abs(val.B - otherVal.B);

            if (differenceR > ChannelTolerance || differenceG > ChannelTolerance || differenceB > ChannelTolerance)
            {
                differencesOverTolerance++;
            }
        }

        var differencePercent = differencesOverTolerance / Rgb.Count * 100;

        return differencePercent < MaxDifferencePercent;
    }
}
}
31 changes: 19 additions & 12 deletions src/BingImageDownload/ImageHashing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ internal bool ImageInHash(string tempFilename, string realFileName)
{
if (HaveFilePathInHashTable(realFileName)) return true;

var testHash = GetRgbaHistogramHash(tempFilename);
var testHash = GetRgbHistogramHash(tempFilename);
return histogramHashTable.Any(hash => hash.Equal(testHash));
}

Expand All @@ -55,14 +55,19 @@ internal bool HaveFilePathInHashTable(string filePath)
return histogramHashTable.Any(x => x.FileName.Equals(fileName, StringComparison.InvariantCultureIgnoreCase));
}

/// <summary>
/// Hashes the image at <paramref name="filePath"/> and adds it to the hash table,
/// unless an entry for that file is already present.
/// </summary>
/// <param name="filePath">Path of the image file to hash.</param>
/// <param name="saveHashTable">
/// When <c>true</c> (the default) the hash table is serialized straight after the
/// new entry is added; pass <c>false</c> to skip the save for this call.
/// </param>
internal void AddHash(string filePath, bool saveHashTable = true)
{
    if (HaveFilePathInHashTable(filePath))
    {
        return;
    }

    histogramHashTable.Add(GetRgbHistogramHash(filePath));

    if (saveHashTable)
    {
        SaveHashTableBin();
    }
}

/// <summary>
/// Persists the hash table by serializing <c>histogramHashTable</c>
/// to <c>histogramBinFile</c>.
/// </summary>
private void SaveHashTableBin()
{
    serializer.Serialize(histogramHashTable, histogramBinFile);
}
Expand All @@ -81,13 +86,13 @@ private void HashExistingImages(int retryCount = 0)
foreach (var file in Directory.GetFiles(paths.SavePath, "*.jpg").Where(x => !HaveFilePathInHashTable(x)))
{
consoleWriter.WriteLine($"Hashing file: {file}");
AddHash(file);
AddHash(file, false);
}

foreach (var file in Directory.GetFiles(paths.ArchivePath, "*.jpg").Where(x => !HaveFilePathInHashTable(x)))
{
consoleWriter.WriteLine($"Hashing file: {file}");
AddHash(file);
AddHash(file, false);
}
}
catch (Exception)
Expand All @@ -103,30 +108,32 @@ private void HashExistingImages(int retryCount = 0)
}
}

/// <summary>
/// Builds a <see cref="HistogramHash"/> for the image at <paramref name="filePath"/>.
/// The image is copied to a uniquely named temporary file under
/// <c>paths.HistogramPath</c>, shrunk to 48x27 and converted to grayscale, and every
/// pixel of the result is recorded. The temporary copy is deleted afterwards.
/// </summary>
/// <param name="filePath">Path of the image file to hash.</param>
/// <returns>A hash carrying the file's name (without directory) and its pixel samples.</returns>
private HistogramHash GetRgbHistogramHash(string filePath)
{
    var histogramFile = Path.Combine(paths.HistogramPath, Guid.NewGuid() + ".jpg");
    var fileName = Path.GetFileName(filePath);
    var pixels = new List<RgbPixelData>();

    File.Copy(filePath, histogramFile);

    try
    {
        using (var image = Image.Load<Rgb24>(histogramFile))
        {
            //Scale down from 1920*1080 to 48*27 - this will pixelate but enough to tell differences.
            //This means 1296 total pixels rather than 2073600.
            image.Mutate(ctx => ctx.Resize(48, 27).Grayscale());

            for (var x = 0; x < image.Width; x++)
            {
                for (var y = 0; y < image.Height; y++)
                {
                    var pixel = image[x, y];
                    pixels.Add(new RgbPixelData(x, y, pixel.R, pixel.G, pixel.B));
                }
            }
        }
    }
    finally
    {
        // Remove the working copy even when loading or resizing throws, so failed
        // hashes do not leave stray files in the histogram directory.
        File.Delete(histogramFile);
    }

    return new HistogramHash(fileName, pixels);
}
}
}
10 changes: 5 additions & 5 deletions src/BingImageDownload/ImagePropertyHandling.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
using SixLabors.ImageSharp.Metadata.Profiles.Exif;
using System;
using System.Text;
using System.Xml;
using System.Xml.Linq;

namespace BingImageDownload
{
internal class ImagePropertyHandling
{
internal void SetTitleOnImage(XmlNode xmlNode, Image image)
internal void SetImageExifTags(XElement imageNode, Image image)
{
var copyright = xmlNode.SelectSingleNode("copyright")?.InnerText;
var copyright = imageNode.Element("copyright")?.Value;
var title = copyright;
var author = string.Empty;
var headline = xmlNode.SelectSingleNode("headline")?.InnerText;
var headline = imageNode.Element("headline")?.Value;

if (copyright != null && copyright.Contains("©"))
{
Expand All @@ -33,7 +33,7 @@ void SetPropertyItemString(ExifTag<byte[]> tag, string value)

SetPropertyItemString(ExifTag.XPTitle, title);
SetPropertyItemString(ExifTag.XPAuthor, author);
SetPropertyItemString(ExifTag.XPComment, $"Bing Image '{headline}' For {xmlNode.SelectSingleNode("startdate")?.InnerText}-{xmlNode.SelectSingleNode("enddate")?.InnerText}");
SetPropertyItemString(ExifTag.XPComment, $"Bing Image '{headline}' For {imageNode.Element("startdate")?.Value}-{imageNode.Element("enddate")?.Value}");
SetPropertyItemString(ExifTag.XPKeywords, DateTime.Now.ToShortDateString());
}
}
Expand Down
Loading

0 comments on commit 71fcc97

Please sign in to comment.