diff --git a/filmatyk/containers.py b/filmatyk/containers.py index 8e9e226..5aa8be7 100644 --- a/filmatyk/containers.py +++ b/filmatyk/containers.py @@ -1,9 +1,11 @@ +from __future__ import annotations from datetime import date # This is a globally used dict that binds Item classes to their names. # It should remain empty, as the classes register themselves here. classByString = {} + class Blueprint(object): """Blueprint is an abstraction of a property that an Item might have. @@ -30,6 +32,9 @@ class Blueprint(object): Static methods define some basic, commonly used presentation functions for known types of properties. """ + + # Presentation styling callables + @staticmethod def _default(x): return str(x) @@ -57,6 +62,8 @@ def _rating(x): def _favourite(x): return '♥' if x == 1 else ' ' + # Functionality + def __init__(self, name:str, colwidth:int, parsing:dict={}, display=None, store=True): self.display_name = name self.column_width = colwidth @@ -76,6 +83,7 @@ def getHeading(self): def getColWidth(self): return self.column_width + class UserData(object): """Encapsulates user information associated with each Item instance. @@ -144,6 +152,7 @@ def serialize(self): serial['rating'] = self.wantto return serial + class BlueprintInheritance(type): """Changes the way inheritance works for Blueprints. Crucial for Item class. @@ -176,6 +185,7 @@ def __new__(cls, name, bases, dct): # The new class is now ready return c + class Item(metaclass=BlueprintInheritance): """Base for all types of records used by Filmweb and in the program. @@ -318,6 +328,21 @@ def asDict(self): _dict['userdata'] = self.userdata.serialize() return _dict + def update(self, other:Item): + """Update own properties from another Item. + + This is useful if the Item's Blueprinted properties have been altered (e.g. + because the remote data was updated) but there is also some custom data + attached to the Item that should not be removed. 
+ + Important note: currently there are no properties requiring this behavior. + """ + for prop in self.storables: + if prop in other.properties.keys(): + self.properties[prop] = other.properties[prop] + self.userdata.addRating(other.userdata.rating) + + class Movie(Item): """Item subclass specialized to hold Movie instances.""" TYPE_STRING = 'FILM' @@ -349,6 +374,7 @@ class Movie(Item): def __init__(self, userdata:dict={}, **properties): super(Movie, self).__init__(userdata, **properties) + class Series(Movie): """Item subclass specialized to hold Series instances. @@ -366,6 +392,7 @@ class Series(Movie): def __init__(self, userdata:dict={}, **properties): super(Series, self).__init__(userdata, **properties) + class Game(Item): """Item subclass specialized to hold Game instances. diff --git a/filmatyk/database.py b/filmatyk/database.py index a90a341..c4d2014 100644 --- a/filmatyk/database.py +++ b/filmatyk/database.py @@ -5,6 +5,7 @@ import containers from filmweb import ConnectionError, FilmwebAPI + class Database(object): def __init__(self, itemtype:str, api:FilmwebAPI, callback:callable): self.itemtype = itemtype @@ -22,6 +23,7 @@ def getItemByID(self, id:int): for item in self.items: if item.getRawProperty('id') == id: return item + return None def __iter__(self): return self.items.__iter__() @@ -43,53 +45,194 @@ def storeToString(self): # Data acquisition def softUpdate(self): - self.callback(0) #display the progress bar - # ask the API how many items should there be and how many are there per page + """Quickly pull the most recent changes from Filmweb. + + The algorithm allows detecting additions and removals of items by comparing + the total item count in the local and remote databases and keeping track of + the items that differ between the two. 
+ The two fundamental problems it solves are related to the fact + that getting to know the full state of the remote database is a very time- + consuming operation (it can only be fetched in chunks of n items, usually + n=25). Therefore the first problem is to determine how many pages to read + from the remote database. The second problem is related to detecting when + an item has been deleted remotely. + The solution can be described in the following steps: + * compare the item counts between the databases - an update is to be made + only if there is a difference, + * fetch a new chunk of the remote database (a page), + * detect which items have been added or changed with respect to the local + (this makes use of a special HashedItem class, see its docs for details), + * identify the last non-changed remote item and find its local counterpart, + * split the local database into two parts: + * a "changed" part comprises all items up to and including this last non- + changed item - all these items are a potentially obsolete state of the + database, and they could be simply replaced with the currently held + remote items, + * an "unchanged" part comprises all other items - nothing about their + remote counterparts is known at this time. + * check whether merging the currently held remote items with the possibly + up-to-date unchanged part of the local database can satisfy the general + condition (that the local and remote databases should count the same). + At some point either the counts will even out, or all of the remote items + will be loaded. In either case the update completes. + + The problem this algorithm solves has one special form that is impossible + to overcome: when a symmetric change has occurred past a certain page. In + this case, any count-based algorithm will stop at the first chance it can + get (when it notices a count balance), ignoring any additions and removals + that may happen on further pages, if they balance out. 
+ Example: + total change: length += 3 + page 1: 4 additions, 1 removal + page 2: 1 addition, 1 removal + The algorithm will reach balance after page 1 and not move on to page 2. + + Returns True in case of success, False if it aborted before completion. + """ + # Display the progress bar + self.callback(0) + # Ask the API how many items should there be (abort on network problems) try: - # in case there are network problems - first_request = self.api.getNumOf(self.itemtype) + num_request = self.api.getNumOf(self.itemtype) except ConnectionError: - self.callback(-1, abort=True) #hide the progress bar - return None - if first_request is None: - #this will happen if the user fails to log in self.callback(-1, abort=True) - return None - rated, per_page = first_request - # compute how many pages should be requested - if not rated or not per_page: - # will happen if the user does not have any items in the list - self.callback(-1) - return None - pages = ceil((rated-len(self.items))/per_page) - # request these pages from the API - itemPages = [] - for page in range(1, pages + 1): - itemPages.append(self.api.getItemsPage(itemtype=self.itemtype, page=page)) - perc_done = int(100 * page / pages) - self.callback(perc_done) #increment the progress bar - self.callback(100) #correct the rounding error - set the bar to full - new_items = [item for page in itemPages for item in page] - # no need to do anything if no new items were acquired - if len(new_items) == 0: + return False + # Exit if the user failed to log in + if num_request is None: + self.callback(-1, abort=True) + return False + # Workload estimation + local_count = len(self.items) + remote_count, items_per_page = num_request + still_need = remote_count - local_count + # Exit if nothing to download + if not remote_count or not items_per_page or not still_need: self.callback(-1) - return False # just in case this was an error during a hardUpdate - # add items to the database, replacing duplicates by new ones - old_items 
= self.items + return False + # Convert the existing database to a hashed format + local_hashed = list(HashedItem(item) for item in self.items) + local_hashed_dict = {item.id: item for item in local_hashed} + # Prepare to and run the main loop + remote_page_no = 0 + remote_items = [] + local_changed = [] + local_unchanged = [] + while still_need: + # Fetch a page and represent it in the hashed form + remote_page_no += 1 + fetched_items = list( + HashedItem(item) for item in + self.api.getItemsPage(self.itemtype, page=remote_page_no) + ) + # Detect additions and changes among the new items + for item in fetched_items: + local_item = local_hashed_dict.get(item.id, None) + # If this ID was not among known items - it's a simple addition + if not local_item: + item.added = True + item.changed = True + else: + # If it was, check if the data differs to detect a change + item.added = False + item.changed = item.hash != local_item.hash + # Store its local counterpart for a safe update + item.local_item = local_item + # Join the new items with the previously acquired but unprocessed ones + remote_items.extend(fetched_items) + # One edge case is that all of the remote items have been just acquired. + # This would happen when updating the Database for the first time. + if len(remote_items) == remote_count: + local_changed = local_hashed + local_unchanged = [] + break + # If the last remote item has been changed, it is difficult to figure out + # how do the currently known remote items relate to the local database. + # In such a case, another page is fetched, allowing a better view. + if remote_items[-1].changed: + continue + # Otherwise, locate the item in the local Database and split it. + last_unchanged_pos = local_hashed.index(remote_items[-1].id) + 1 + local_changed = local_hashed[:last_unchanged_pos] + local_unchanged = local_hashed[last_unchanged_pos:] + # Check if the databases would balance out if they were merged right now. 
+ still_need = remote_count - (len(remote_items) + len(local_unchanged)) + # At this point the database can be reconstructed from the two components. + new_items = [] + # First, incorporate the changes from the remotely acquired items + for item in remote_items: + # If the item had a local counterpart, do not throw it away but instead + # update it with the remotely acquired data (allows preserving any local + # data that might not originate at the remote database). + if item.local_item: + local_item = item.local_item.parent + local_item.update(item.parent) + new_items.append(local_item) + else: + new_items.append(item.parent) + # Then add the rest of unchanged items. + new_items.extend(item.parent for item in local_unchanged) self.items = new_items - new_ids = [item['id'] for item in new_items] - for item in old_items: - if item['id'] not in new_ids: - self.items.append(item) + # Finalize - notify the GUI and potential caller. self.callback(-1) self.isDirty = True return True def hardUpdate(self): - # in theory, this removes all existing items and recollects the whole data - # but in practice this reacquisition may fail - in which case we shouldn't - # just lose the existing database and shrug, so this backs it up first + """Drop all the Items and reload all the data. + + This uses softUpdate under the hood. In case of its failure, no data is + lost as everything is backed up first. + """ old_items = self.items self.items = [] if not self.softUpdate(): self.items = old_items + + +class HashedItem(): + """A hashed representation of an Item that allows detecting changes. + + Computing a standard hash of the Item's UserData makes it possible to detect + when an Item was not just added or removed but also whether it has changed + with respect to the locally stored version of that Item. + Flags indicating whether an item was added or changed are helpful in the + process of performing an update. 
+ + In theory, the Item class itself could implement the hashing functionality, + but doing this in a separate technical class also allows storing the flags, + which would only clutter the base class. + + Some caveats: + * HashedItem also maintains a reference to the original item that it has been + created from. This is convenient during the update operation, as it allows + operating directly on the list of HashedItems instead of having to ensure + that each list-changing operation happens both on the list of hashes and + the list of the original items. + * When used to hash a remotely acquired item, the corresponding local version + of that item can be attached to the HashedItem. This saves an additional + search operation later in the update process. + * HashedItem can be equality-compared with not only instances of the same + type, but also ints. This makes it possible to search for an integer ID in + a list of HashedItems. + """ + hash_data = ['rating', 'comment', 'dateOf'] + + def __init__(self, item:containers.Item): + self.parent = item + self.id = item.getRawProperty('id') + self.hash = self.computeHash(item) + # Flags used to compare remote items with the local ones + self.added = None + self.changed = None + self.local_item = None + + def computeHash(self, item:containers.Item): + """Summarize UserData of an Item by a simple hash function.""" + userDataString = '#'.join(item[prop] for prop in self.hash_data) + return hash(userDataString) + + def __eq__(self, other): + if isinstance(other, int): + return self.id == other + else: + return super(HashedItem, self).__eq__(other) diff --git a/test/test_database.py b/test/test_database.py index 17b563e..b6f2973 100644 --- a/test/test_database.py +++ b/test/test_database.py @@ -152,7 +152,7 @@ class TestDatabaseCreation(unittest.TestCase): """Basic test for Database loading data from scratch using the API.""" @classmethod def setUpClass(self): - self.api = FakeAPI('data') + self.api = FakeAPI('assets') def 
test_creation(self): """Create a new Database and fill it with items using (Fake)API. @@ -181,7 +181,7 @@ class TestDatabaseSerialization(unittest.TestCase): """ @classmethod def setUpClass(self): - self.api = FakeAPI('data') + self.api = FakeAPI('assets') def test_serialization(self): """Serialize and deserialize a Database, check if they look the same.""" @@ -274,6 +274,11 @@ def test_simpleAddition(self): scenario = UpdateScenario(removals=[0, 1, 2]) self.__test_body(scenario) + def test_massiveAddition(self): + """Add over one full page of new items.""" + scenario = UpdateScenario(removals=list(range(37))) + self.__test_body(scenario) + def test_randomAddition(self): """Add an item missing from somewhere on the first page.""" scenario = UpdateScenario(removals=[4]) @@ -284,8 +289,6 @@ def test_nonContinuousAddition(self): scenario = UpdateScenario(removals=[0, 1, 2, 3, 6]) self.__test_body(scenario) - @unittest.expectedFailure - # The current algorithm is very naive and thus unable to do that. def test_multipageAddition(self): """Add a few items non-continuously missing from multiple pages.""" scenario = UpdateScenario(removals=[0, 1, 2, 16, 30, 32]) @@ -293,25 +296,21 @@ def test_multipageAddition(self): # Removal tests - are all expected to fail at this moment. 
- @unittest.expectedFailure def test_singleRemoval(self): """Remove a single item from the first page.""" scenario = UpdateScenario(additions=[(0, 666)]) self.__test_body(scenario) - @unittest.expectedFailure def test_simpleRemoval(self): """Remove a few items from the first page.""" scenario = UpdateScenario(additions=[(0, 666), (1, 4270)]) self.__test_body(scenario) - @unittest.expectedFailure def test_randomRemoval(self): """Remove an item from somewhere on the first page.""" scenario = UpdateScenario(additions=[(4, 420)]) self.__test_body(scenario) - @unittest.expectedFailure def test_nonContinuousRemoval(self): """Remove a few items non-continuously from the first page.""" scenario = UpdateScenario( @@ -319,7 +318,6 @@ def test_nonContinuousRemoval(self): ) self.__test_body(scenario) - @unittest.expectedFailure def test_multipageRemoval(self): """Remove a few items non-continuously from multiple pages.""" scenario = UpdateScenario( @@ -329,8 +327,7 @@ def test_multipageRemoval(self): # Other tests - for future features. - @unittest.expectedFailure - def test_complexAdditionRemoval(self): + def test_additionRemoval(self): """Add and remove a few items at once, but only from the first page.""" scenario = UpdateScenario( removals=[0, 1, 2, 9, 13], @@ -338,9 +335,17 @@ def test_complexAdditionRemoval(self): ) self.__test_body(scenario) + def test_complexAdditionRemoval(self): + """Add and remove a few items at once from multiple pages.""" + scenario = UpdateScenario( + removals=[0, 1, 2, 9, 23, 35, 36], + additions=[(3, 1991), (4, 37132), (28, 628)] + ) + self.__test_body(scenario) + @unittest.skip('Relevant feature not implemented yet.') def test_difficultAdditionRemoval(self): - """Add and remove a few items at once from multiple pages. + """Add and remove a few items at once from multiple pages WITH BALANCE. This test is extremely difficult because it is impossible to recognize such scenario in real usage (online), by looking at getNumOf alone. That number