"""Test to check intersection logic when no intersection area returned."""
import os
import sys

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
    LAParams,
    LTAnno,
    LTChar,
    LTTextLineHorizontal,
    LTTextLineVertical,
    LTImage,
    LTTextBoxHorizontal
)

from camelot.utils import bbox_intersection_area

# Directory holding the PDF fixtures used by the tests in this module.
testdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")


def get_text_from_pdf(filename):
    """Return the first horizontal text box found in a PDF file.

    Pages are run through pdfminer's layout analysis in order, and the first
    ``LTTextBoxHorizontal`` element encountered is returned.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The first ``LTTextBoxHorizontal`` layout element, or ``None`` when no
        page yields one.
    """
    # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
    # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The context manager closes the file on every exit path, including the
    # early return below, so the handle is no longer leaked.
    with open(filename, "rb") as document:
        for page in PDFPage.get_pages(document):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    return element
    return None


def test_bbox_intersection_text():
    """
    Test to check area of intersection between both boxes when no intersection area returned
    """
    filename1 = os.path.join(testdir, "foo.pdf")
    pdftextelement1 = get_text_from_pdf(filename1)
    filename2 = os.path.join(testdir, "tabula/12s0324.pdf")
    pdftextelement2 = get_text_from_pdf(filename2)

    # Fail with a clear message if a fixture produced no text box, rather than
    # with an opaque error inside bbox_intersection_area.
    assert pdftextelement1 is not None, "no horizontal text box found in foo.pdf"
    assert pdftextelement2 is not None, "no horizontal text box found in tabula/12s0324.pdf"

    assert bbox_intersection_area(pdftextelement1, pdftextelement2) == 0.0