diff --git a/cl/recap/factories.py b/cl/recap/factories.py
index 9b786ed4fd..64f3afb714 100644
--- a/cl/recap/factories.py
+++ b/cl/recap/factories.py
@@ -93,6 +93,7 @@ class RECAPEmailDocketEntryDataFactory(DictFactory):
     pacer_doc_id = Faker("random_id_string")
     pacer_magic_num = Faker("random_id_string")
     pacer_seq_no = Faker("random_id_string")
+    short_description = Faker("text", max_nb_chars=15)


 class RECAPEmailDocketDataFactory(DictFactory):
diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py
index 0bbef5a5ec..95fd75cc98 100644
--- a/cl/recap/mergers.py
+++ b/cl/recap/mergers.py
@@ -822,6 +822,35 @@ async def get_or_make_docket_entry(
     return de, de_created

+
+async def keep_latest_rd_document(queryset: QuerySet) -> RECAPDocument:
+    """Retains the most recent item with a PDF if available; otherwise,
+    retains the most recent item overall.
+
+    :param queryset: RECAPDocument QuerySet to clean duplicates from.
+    :return: The matched RECAPDocument after cleaning.
+    """
+    rd_with_pdf_queryset = queryset.filter(is_available=True).exclude(
+        filepath_local=""
+    )
+    if await rd_with_pdf_queryset.aexists():
+        rd = await rd_with_pdf_queryset.alatest("date_created")
+    else:
+        rd = await queryset.alatest("date_created")
+    await queryset.exclude(pk=rd.pk).adelete()
+    return rd
+
+
+async def clean_duplicate_documents(params: dict[str, Any]) -> RECAPDocument:
+    """Removes duplicate RECAPDocuments, keeping the most recent one with a
+    PDF if available, or otherwise the most recent overall.
+
+    :param params: Query parameters to filter the RECAPDocuments.
+    :return: The matched RECAPDocument after cleaning.
+    """
+    duplicate_rd_queryset = RECAPDocument.objects.filter(**params)
+    return await keep_latest_rd_document(duplicate_rd_queryset)
+

 async def add_docket_entries(
     d: Docket,
     docket_entries: list[dict[str, Any]],
@@ -934,17 +963,28 @@ async def add_docket_entries(
             rd = await RECAPDocument.objects.aget(**get_params)
             rds_updated.append(rd)
         except RECAPDocument.DoesNotExist:
-            try:
-                params["pacer_doc_id"] = docket_entry["pacer_doc_id"]
-                rd = await RECAPDocument.objects.acreate(
-                    document_number=docket_entry["document_number"] or "",
-                    is_available=False,
-                    **params,
-                )
-            except ValidationError:
-                # Happens from race conditions.
-                continue
-            rds_created.append(rd)
+            rd = None
+            if de_created is False and not appelate_court_id_exists:
+                try:
+                    # Check for documents with a bad pacer_doc_id.
+                    rd = await RECAPDocument.objects.aget(**params)
+                except RECAPDocument.DoesNotExist:
+                    # Fall back to creating the document.
+                    pass
+                except RECAPDocument.MultipleObjectsReturned:
+                    rd = await clean_duplicate_documents(params)
+            if rd is None:
+                try:
+                    params["pacer_doc_id"] = docket_entry["pacer_doc_id"]
+                    rd = await RECAPDocument.objects.acreate(
+                        document_number=docket_entry["document_number"] or "",
+                        is_available=False,
+                        **params,
+                    )
+                    rds_created.append(rd)
+                except ValidationError:
+                    # Happens from race conditions.
+                    continue
         except RECAPDocument.MultipleObjectsReturned:
             logger.info(
                 "Multiple recap documents found for document entry number'%s' "
@@ -952,17 +992,10 @@ async def add_docket_entries(
             )
             if params["document_type"] == RECAPDocument.ATTACHMENT:
                 continue
-            duplicate_rd_queryset = RECAPDocument.objects.filter(**params)
-            rd_with_pdf_queryset = duplicate_rd_queryset.filter(
-                is_available=True
-            ).exclude(filepath_local="")
-            if await rd_with_pdf_queryset.aexists():
-                rd = await rd_with_pdf_queryset.alatest("date_created")
-            else:
-                rd = await duplicate_rd_queryset.alatest("date_created")
-            await duplicate_rd_queryset.exclude(pk=rd.pk).adelete()
+            rd = await clean_duplicate_documents(params)

-        rd.pacer_doc_id = rd.pacer_doc_id or docket_entry["pacer_doc_id"]
+        if docket_entry["pacer_doc_id"]:
+            rd.pacer_doc_id = docket_entry["pacer_doc_id"]
         description = docket_entry.get("short_description")
         if rd.document_type == RECAPDocument.PACER_DOCUMENT and description:
             rd.description = description
@@ -1604,14 +1637,7 @@ async def clean_duplicate_attachment_entries(
     )
     async for dupe in dupes.aiterator():
         duplicate_rd_queryset = rds.filter(pacer_doc_id=dupe.pacer_doc_id)
-        rd_with_pdf_queryset = duplicate_rd_queryset.filter(
-            is_available=True
-        ).exclude(filepath_local="")
-        if await rd_with_pdf_queryset.aexists():
-            keep_rd = await rd_with_pdf_queryset.alatest("date_created")
-        else:
-            keep_rd = await duplicate_rd_queryset.alatest("date_created")
-        await duplicate_rd_queryset.exclude(pk=keep_rd.pk).adelete()
+        await keep_latest_rd_document(duplicate_rd_queryset)


 async def merge_attachment_page_data(
@@ -1673,15 +1699,7 @@ async def merge_attachment_page_data(
     except RECAPDocument.MultipleObjectsReturned as exc:
         if pacer_case_id:
-            duplicate_rd_queryset = RECAPDocument.objects.filter(**params)
-            rd_with_pdf_queryset = duplicate_rd_queryset.filter(
-                is_available=True
-            ).exclude(filepath_local="")
-            if await rd_with_pdf_queryset.aexists():
-                keep_rd = await rd_with_pdf_queryset.alatest("date_created")
-            else:
-                keep_rd = await duplicate_rd_queryset.alatest("date_created")
-            await duplicate_rd_queryset.exclude(pk=keep_rd.pk).adelete()
+            await clean_duplicate_documents(params)
             main_rd = await RECAPDocument.objects.select_related(
                 "docket_entry", "docket_entry__docket"
             ).aget(**params)
@@ -1711,23 +1729,7 @@ async def merge_attachment_page_data(
             break
         except RECAPDocument.MultipleObjectsReturned as exc:
             if pacer_case_id:
-                duplicate_rd_queryset = RECAPDocument.objects.filter(
-                    **params
-                )
-                rd_with_pdf_queryset = duplicate_rd_queryset.filter(
-                    is_available=True
-                ).exclude(filepath_local="")
-                if await rd_with_pdf_queryset.aexists():
-                    keep_rd = await rd_with_pdf_queryset.alatest(
-                        "date_created"
-                    )
-                else:
-                    keep_rd = await duplicate_rd_queryset.alatest(
-                        "date_created"
-                    )
-                await duplicate_rd_queryset.exclude(
-                    pk=keep_rd.pk
-                ).adelete()
+                await clean_duplicate_documents(params)
                 main_rd = await RECAPDocument.objects.select_related(
                     "docket_entry", "docket_entry__docket"
                 ).aget(**params)
diff --git a/cl/recap/tests.py b/cl/recap/tests.py
index bb249b6246..d82b06354a 100644
--- a/cl/recap/tests.py
+++ b/cl/recap/tests.py
@@ -1072,6 +1072,102 @@ def test_process_attachments_for_subdocket_pq_with_missing_main_rd(
             msg="Didn't get the expected docket ID.",
         )

+    def test_match_recap_document_with_wrong_pacer_doc_id(self, mock_upload):
+        """Confirm that when an existing RECAPDocument has an invalid
+        pacer_doc_id, we can still match it after excluding the pacer_doc_id
+        from the lookup.
+ """ + + de_data = DocketEntriesDataFactory( + docket_entries=[ + RECAPEmailDocketEntryDataFactory( + pacer_doc_id="04505578690", + document_number=5, + ) + ], + ) + de = DocketEntryWithParentsFactory( + docket__court=self.court, entry_number=5 + ) + rd = RECAPDocumentFactory( + docket_entry=de, + document_type=RECAPDocument.PACER_DOCUMENT, + pacer_doc_id="04505578691", + document_number="5", + description="", + ) + # Add the docket entry with the updated pacer_doc_id + async_to_sync(add_docket_entries)(de.docket, de_data["docket_entries"]) + recap_documents = RECAPDocument.objects.all() + self.assertEqual( + recap_documents.count(), 1, msg="Wrong number of RECAPDocuments" + ) + rd.refresh_from_db() + self.assertEqual( + rd.description, + de_data["docket_entries"][0]["short_description"], + msg="The short description doesn't match.", + ) + self.assertEqual( + rd.pacer_doc_id, + de_data["docket_entries"][0]["pacer_doc_id"], + msg="The pacer_doc_id doesn't match.", + ) + + def test_match_recap_document_with_wrong_pacer_doc_id_duplicated( + self, mock_upload + ): + """Confirm that when an existing RECAPDocument has an invalid + pacer_doc_id, we can still match it after excluding the pacer_doc_id + from the lookup, even if there is more than one PACER_DOCUMENT that + belongs to the docket entry. + """ + + de_data = DocketEntriesDataFactory( + docket_entries=[ + RECAPEmailDocketEntryDataFactory( + pacer_doc_id="04505578690", + document_number=5, + ) + ], + ) + de = DocketEntryWithParentsFactory( + docket__court=self.court, entry_number=5 + ) + RECAPDocumentFactory( + document_type=RECAPDocument.PACER_DOCUMENT, + docket_entry=de, + pacer_doc_id="04505578691", + document_number="5", + description="", + ) + rd_2 = RECAPDocumentFactory( + document_type=RECAPDocument.PACER_DOCUMENT, + docket_entry=de, + pacer_doc_id="04505578691", + document_number="6", + description="", + is_available=True, + ) + # Add the docket entry with the updated pacer_doc_id, remove the + # duplicated RD, and keep the one that is available. + async_to_sync(add_docket_entries)(de.docket, de_data["docket_entries"]) + recap_documents = RECAPDocument.objects.all() + self.assertEqual( + recap_documents.count(), 1, msg="Wrong number of RECAPDocuments" + ) + rd_2.refresh_from_db() + self.assertEqual( + rd_2.description, + de_data["docket_entries"][0]["short_description"], + msg="The short description doesn't match.", + ) + self.assertEqual( + rd_2.pacer_doc_id, + de_data["docket_entries"][0]["pacer_doc_id"], + msg="The pacer_doc_id doesn't match.", + ) + @mock.patch("cl.recap.tasks.DocketReport", new=fakes.FakeDocketReport) @mock.patch(