diff --git a/.gitignore b/.gitignore
index 5f112e2..d0a5989 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,9 @@ tmp*
tags
*.pkg
+#Large data files
+# nbs/data
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
diff --git a/HerbariaOCR/AzureVision.py b/HerbariaOCR/AzureVision.py
new file mode 100644
index 0000000..d3c0511
--- /dev/null
+++ b/HerbariaOCR/AzureVision.py
@@ -0,0 +1,583 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_AzureVision.ipynb.
+
+# %% auto 0
+__all__ = ['endpoint', 'key', 'page_width', 'page_height', 'sanitize_filename', 'format_bounding_box', 'draw_boxes',
+ 'calculate_scale', 'extract_info', 'analyze_read', 'model_function', 'get_text_density_map',
+ 'find_highest_density_area', 'crop_image_to_text', 'get_text_bounding_boxes', 'combine_text_regions',
+ 'parse_document_content', 'analyze_text_density_and_crop']
+
+# %% ../nbs/00_AzureVision.ipynb 10
+# Azure Cognitive Services endpoint and key.
+# Both names are read as module globals by analyze_read() when it builds its
+# DocumentAnalysisClient.
+# NOTE(review): `key` is a placeholder and must be replaced before running;
+# consider loading it from the environment instead of the source file.
+endpoint = "https://herbariumsamplerecognition.cognitiveservices.azure.com/"
+key = "your key here"
+
+def sanitize_filename(filename):
+    """Strip characters that are not wanted in an output file name.
+
+    Word characters (letters, digits, underscore), whitespace, dots and
+    hyphens are kept; every other character is deleted.
+    """
+    disallowed = r'[^\w\s\.-]'
+    cleaned = re.sub(disallowed, '', filename)
+    return cleaned
+
+def format_bounding_box(bounding_box):
+    """Render a polygon as "[x, y], [x, y], ..." text.
+
+    Returns "N/A" when `bounding_box` is empty or None. Each element must
+    expose `.x` and `.y` attributes.
+    """
+    if bounding_box:
+        corners = ("[{}, {}]".format(point.x, point.y) for point in bounding_box)
+        return ", ".join(corners)
+    return "N/A"
+
+
+def draw_boxes(image_path, words):
+    """Return a copy of the image with every recognised word outlined and labelled.
+
+    Args:
+        image_path: Path of the image to annotate; the file itself is not
+            modified.
+        words: Iterable of dicts with a 'content' string and a 'polygon'
+            (a sequence of points exposing `.x`/`.y`, or a falsy value when
+            the word has no geometry).
+
+    Returns:
+        A new image with a red outline around each word polygon and the
+        word's text drawn in green at the polygon's first point.
+    """
+    # Copy inside a context manager so the source file handle is released as
+    # soon as the pixel data has been duplicated.
+    with Image.open(image_path) as original_image:
+        annotated_image = original_image.copy()
+    draw = ImageDraw.Draw(annotated_image)
+
+    for word in words:
+        polygon = word['polygon']
+        if not polygon:
+            continue  # nothing to draw for a word without geometry
+
+        bbox = [(point.x, point.y) for point in polygon]
+        try:
+            # Drop every non-ASCII character before drawing the label.
+            # NOTE(review): presumably because the default font cannot
+            # render them -- confirm.
+            text_content = word['content'].encode('ascii', 'ignore').decode('ascii')
+        except Exception as e:
+            print(f"Error processing text {word['content']}: {e}")
+            text_content = "Error"
+        draw.polygon(bbox, outline="red")
+        draw.text(bbox[0], text_content, fill="green")
+
+    return annotated_image
+
+# Page dimensions used when laying out the PDF report.
+# NOTE(review): `letter` is not defined in the visible part of this module;
+# presumably reportlab's letter page size (612 x 792 points) -- confirm.
+page_width, page_height = letter
+
+def calculate_scale(image, max_width, max_height):
+    """Return the factor that fits `image` inside `max_width` x `max_height`.
+
+    The result is the smaller of the width ratio and the height ratio, so
+    scaling both image dimensions by it preserves the aspect ratio. A value
+    above 1 means the image would be enlarged.
+    """
+    return min(max_width / image.width, max_height / image.height)
+
+
+def extract_info(text):
+    """Ask the OpenAI chat API to summarise OCR text as Darwin Core JSON.
+
+    Args:
+        text: Text recognised on a specimen image.
+
+    Returns:
+        The content of the first choice returned by the model, the string
+        "No response from the API." when no choices come back, or
+        "An error occurred: <message>" when the request raises.
+    """
+    # Set your OpenAI API key
+    openai.api_key = 'your key here'
+
+    prompt = f"From the provided text, return only the relevant information in a JSON format according to the Darwin Core standard for biodiversity specimen:\n{text}"
+    conversation = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
+    ]
+
+    try:
+        response = openai.ChatCompletion.create(
+            model="gpt-4-1106-preview",
+            messages=conversation,
+        )
+        if not response.choices:
+            return "No response from the API."
+        return response.choices[0].message['content']
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+
+
+def analyze_read(image_path, output_path):
+    """OCR one image with Azure and write a PDF report for it.
+
+    The report contains the original image, a copy annotated with the
+    recognised word polygons, and the recognised text followed by per-word
+    confidence values. The same text is sent to extract_info() and the reply
+    is printed.
+
+    Args:
+        image_path: Path of the image file to analyse.
+        output_path: Directory that receives the generated PDF (it is not
+            created here).
+
+    Returns:
+        None. Any exception is caught and reported on stdout so a batch run
+        can continue with the next image.
+    """
+    import tempfile  # local import: only needed for the scratch PNG below
+
+    try:
+        with open(image_path, "rb") as f:
+            image_stream = f.read()
+
+        document_analysis_client = DocumentAnalysisClient(
+            endpoint=endpoint, credential=AzureKeyCredential(key)
+        )
+
+        poller = document_analysis_client.begin_analyze_document(
+            "prebuilt-read", image_stream)
+        result = poller.result()
+
+        # Collect words, their polygon data, and confidence
+        words = []
+        confidence_lines = []
+        for page in result.pages:
+            for word in page.words:
+                words.append({
+                    'content': word.content,
+                    'polygon': word.polygon
+                })
+                confidence_lines.append("'{}' confidence {}\n".format(word.content, word.confidence))
+
+        document_content = result.content + "\n\nConfidence Metrics:\n" + "".join(confidence_lines)
+        extracted_info = extract_info(document_content)
+        print(extracted_info)
+        annotated_img = draw_boxes(image_path, words)
+
+        # Build the output name from the stem so .png/.jpeg/.JPG inputs also
+        # end in .pdf; replacing the literal '.jpg' left those names untouched.
+        stem = os.path.splitext(os.path.basename(image_path))[0]
+        output_filename = os.path.join(output_path, sanitize_filename(stem + '.pdf'))
+        c = canvas.Canvas(output_filename, pagesize=letter)
+
+        # Draw the original image once, scaled to fit the page and anchored
+        # at the top-left corner. The handle is only needed for the size.
+        with Image.open(image_path) as original_image:
+            scale = calculate_scale(original_image, page_width, page_height)
+            img_width, img_height = original_image.width * scale, original_image.height * scale
+        c.drawImage(image_path, 0, page_height - img_height, width=img_width, height=img_height, mask='auto')
+        y_position = page_height - img_height
+
+        # The annotated image has to exist on disk for drawImage(); keep it in
+        # a private scratch directory that is removed once the PDF is saved.
+        with tempfile.TemporaryDirectory() as scratch_dir:
+            annotated_image_path = os.path.join(scratch_dir, 'annotated_image.png')
+            annotated_img.save(annotated_image_path)
+            scale = calculate_scale(annotated_img, page_width, page_height)
+            annotated_img_width, annotated_img_height = annotated_img.width * scale, annotated_img.height * scale
+            if y_position - annotated_img_height < 0:
+                # Not enough room left: continue at the top of a new page.
+                c.showPage()
+                y_position = page_height
+            # Track the bottom edge of the annotated image so the text that
+            # follows starts below it instead of on top of it.
+            y_position -= annotated_img_height
+            c.drawImage(annotated_image_path, 0, y_position, width=annotated_img_width, height=annotated_img_height, mask='auto')
+
+            # Add text
+            textobject = c.beginText()
+            textobject.setTextOrigin(10, y_position - 15)
+            textobject.setFont("Times-Roman", 12)
+
+            for line in document_content.split('\n'):
+                if textobject.getY() - 15 < 0:  # Check if new page is needed for more text
+                    c.drawText(textobject)
+                    c.showPage()
+                    textobject = c.beginText()
+                    textobject.setTextOrigin(10, page_height - 15)
+                    textobject.setFont("Times-Roman", 12)
+                textobject.textLine(line)
+
+            c.drawText(textobject)
+            c.save()
+
+    except Exception as e:
+        print(f"An error occurred while processing {image_path}: {e}")
+
+
+if __name__ == "__main__":
+    # Batch driver: run analyze_read() on the first few images found in the
+    # input folder and write the results to the output folder.
+    input_folder = 'data/resized-images/'
+    output_folder = 'data/temp/'
+    image_suffixes = ('.png', '.jpg', '.jpeg')
+    max_images = 5  # stop after this many images have been processed
+
+    # Create the output folder if it doesn't exist
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+
+    image_counter = 0
+    for image_file in os.listdir(input_folder):
+        image_path = os.path.join(input_folder, image_file)
+        if not image_path.lower().endswith(image_suffixes):
+            continue  # skip anything that is not an image
+
+        analyze_read(image_path, output_folder)
+        image_counter += 1
+        if image_counter >= max_images:
+            break
+
+
+
+
+
+# %% ../nbs/00_AzureVision.ipynb 12
+# Azure Cognitive Services endpoint and key.
+# Both names are read as module globals by analyze_read() when it builds its
+# DocumentAnalysisClient.
+# NOTE(review): `key` is a placeholder and must be replaced before running;
+# consider loading it from the environment instead of the source file.
+endpoint = "https://herbariumsamplerecognition.cognitiveservices.azure.com/"
+key = "your key here"
+
+def sanitize_filename(filename):
+    """Strip characters that are not wanted in an output file name.
+
+    Word characters (letters, digits, underscore), whitespace, dots and
+    hyphens are kept; every other character is deleted.
+    """
+    disallowed = r'[^\w\s\.-]'
+    cleaned = re.sub(disallowed, '', filename)
+    return cleaned
+
+def extract_info(text):
+    """Ask the OpenAI chat API to summarise OCR text as Darwin Core JSON.
+
+    The prompt additionally asks the model to always include a 'country'
+    field, guessing it from context when none is stated.
+
+    Args:
+        text: Text recognised on a specimen image.
+
+    Returns:
+        The content of the first choice returned by the model, the string
+        "No response from the API." when no choices come back, or
+        "An error occurred: <message>" when the request raises.
+    """
+    # Set your OpenAI API key
+    openai.api_key = 'your key here'
+
+    # The OCR text is appended exactly once; sending it twice only doubled
+    # the prompt size.
+    prompt = f"From the provided text, return only the relevant information in a JSON format according to the Darwin Core standard for biodiversity specimen. Note: make sure that each output has a 'country' field. If you do not find an explicit country, make your best guess at the country using the context of the other text.\n{text}"
+
+    try:
+        # Send the request to the API
+        response = openai.ChatCompletion.create(
+            model="gpt-4-1106-preview",
+            messages=[{"role": "system", "content": "You are a helpful assistant."},
+                      {"role": "user", "content": prompt}]
+        )
+
+        # Extract the response
+        return response.choices[0].message['content'] if response.choices else "No response from the API."
+
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+
+def analyze_read(image_path, output_path):
+    """OCR one image with Azure and save the extracted information as text.
+
+    The recognised text is passed to extract_info() and the reply is written
+    to `<image stem>.txt` inside `output_path`.
+
+    Args:
+        image_path: Path of the image file to analyse.
+        output_path: Directory that receives the text file (it is not
+            created here).
+
+    Returns:
+        None. Any exception is caught and reported on stdout so a batch run
+        can continue with the next image.
+    """
+    try:
+        with open(image_path, "rb") as f:
+            image_stream = f.read()
+
+        document_analysis_client = DocumentAnalysisClient(
+            endpoint=endpoint, credential=AzureKeyCredential(key)
+        )
+
+        poller = document_analysis_client.begin_analyze_document(
+            "prebuilt-read", image_stream)
+        result = poller.result()
+
+        # Collect the content from the document
+        document_content = result.content
+        extracted_info = extract_info(document_content)
+
+        # Build the output name from the stem so .png/.jpeg/.JPG inputs also
+        # end in .txt; replacing the literal '.jpg' left those names untouched.
+        stem = os.path.splitext(os.path.basename(image_path))[0]
+        output_filename = os.path.join(output_path, sanitize_filename(stem + '.txt'))
+        # Explicit UTF-8: the reply may contain non-ASCII label text, which
+        # the platform default encoding cannot always represent.
+        with open(output_filename, 'w', encoding='utf-8') as text_file:
+            text_file.write(extracted_info)
+
+    except Exception as e:
+        print(f"An error occurred while processing {image_path}: {e}")
+
+if __name__ == "__main__":
+    # Batch driver: run analyze_read() on the images found in the input
+    # folder, up to a fixed limit, writing one result file per image.
+    input_folder = 'data/resized-images/'
+    output_folder = 'data/AzureVisionResults/'
+    image_suffixes = ('.png', '.jpg', '.jpeg')
+    max_images = 224  # stop after this many images have been processed
+
+    # Create the output folder if it doesn't exist
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+
+    image_counter = 0
+    for image_file in os.listdir(input_folder):
+        image_path = os.path.join(input_folder, image_file)
+        if not image_path.lower().endswith(image_suffixes):
+            continue  # skip anything that is not an image
+
+        analyze_read(image_path, output_folder)
+        image_counter += 1
+        if image_counter >= max_images:
+            break
+
+
+# %% ../nbs/00_AzureVision.ipynb 15
+# Azure Cognitive Services endpoint and key.
+# Both names are read as module globals by analyze_read() when it builds its
+# DocumentAnalysisClient.
+# NOTE(review): `key` is a placeholder and must be replaced before running;
+# consider loading it from the environment instead of the source file.
+endpoint = "https://herbariumsamplerecognition.cognitiveservices.azure.com/"
+key = "your key here"
+
+
+def sanitize_filename(filename):
+    """Strip characters that are not wanted in an output file name.
+
+    Word characters (letters, digits, underscore), whitespace, dots and
+    hyphens are kept; every other character is deleted.
+    """
+    disallowed = r'[^\w\s\.-]'
+    cleaned = re.sub(disallowed, '', filename)
+    return cleaned
+
+def extract_info(text):
+    """Ask the OpenAI chat API to summarise OCR text as Darwin Core JSON.
+
+    The prompt additionally asks the model to always include a 'country'
+    field, guessing it from context when none is stated.
+
+    Args:
+        text: Text recognised on a specimen image.
+
+    Returns:
+        The content of the first choice returned by the model, the string
+        "No response from the API." when no choices come back, or
+        "An error occurred: <message>" when the request raises.
+    """
+    # Set your OpenAI API key
+    openai.api_key = 'your key here'
+
+    # The OCR text is appended exactly once; sending it twice only doubled
+    # the prompt size.
+    prompt = f"From the provided text, return only the relevant information in a JSON format according to the Darwin Core standard for biodiversity specimen. Note: make sure that each output has a 'country' field. If you do not find an explicit country, make your best guess at the country using the context of the other text.\n{text}"
+
+    try:
+        # Send the request to the API
+        response = openai.ChatCompletion.create(
+            model="gpt-4-1106-preview",
+            messages=[{"role": "system", "content": "You are a helpful assistant."},
+                      {"role": "user", "content": prompt}]
+        )
+
+        # Extract the response
+        return response.choices[0].message['content'] if response.choices else "No response from the API."
+
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+
+def analyze_read(image_stream):
+    """Run Azure OCR on an in-memory image and return the extracted information.
+
+    Args:
+        image_stream: Image data handed to the "prebuilt-read" model.
+
+    Returns:
+        Whatever extract_info() returns for the recognised text, or
+        "An error occurred: <message>" when any step raises.
+    """
+    try:
+        credential = AzureKeyCredential(key)
+        document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=credential)
+        poller = document_analysis_client.begin_analyze_document("prebuilt-read", image_stream)
+
+        # Block until the analysis finishes, then pass the text on.
+        recognised_text = poller.result().content
+        return extract_info(recognised_text)
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+
+def model_function(image):
+    """Adapter that feeds an in-memory image array to analyze_read().
+
+    `image` is cast to uint8, converted to an RGB image, encoded as JPEG in
+    memory, and the resulting bytes are passed to analyze_read(), whose
+    return value is returned unchanged.
+    """
+    rgb_image = Image.fromarray(np.uint8(image)).convert('RGB')
+
+    # Encode to JPEG without touching the disk.
+    buffer = io.BytesIO()
+    rgb_image.save(buffer, format='JPEG')
+
+    return analyze_read(buffer.getvalue())
+
+# iface = gr.Interface(fn=model_function, inputs="image", outputs="text")
+# iface.launch(share=True)
+
+
+# %% ../nbs/00_AzureVision.ipynb 17
+# Azure Cognitive Services endpoint and key.
+# Both names are read as module globals by the functions below when they
+# build a DocumentAnalysisClient.
+# NOTE(review): `key` is a placeholder and must be replaced before running;
+# consider loading it from the environment instead of the source file.
+endpoint = "https://herbariumsamplerecognition.cognitiveservices.azure.com/"
+key = "AZURE KEY HERE"
+
+def sanitize_filename(filename):
+    """Strip characters that are not wanted in an output file name.
+
+    Word characters (letters, digits, underscore), whitespace, dots and
+    hyphens are kept; every other character is deleted.
+    """
+    disallowed = r'[^\w\s\.-]'
+    cleaned = re.sub(disallowed, '', filename)
+    return cleaned
+
+def format_bounding_box(bounding_box):
+    """Render a polygon as "[x, y], [x, y], ..." text.
+
+    Returns "N/A" when `bounding_box` is empty or None. Each element must
+    expose `.x` and `.y` attributes.
+    """
+    if bounding_box:
+        corners = ("[{}, {}]".format(point.x, point.y) for point in bounding_box)
+        return ", ".join(corners)
+    return "N/A"
+
+def draw_boxes(image_path, words):
+    """Return a copy of the image with every recognised word outlined and labelled.
+
+    Args:
+        image_path: Path of the image to annotate; the file itself is not
+            modified.
+        words: Iterable of dicts with a 'content' string and a 'polygon'
+            (a sequence of points exposing `.x`/`.y`, or a falsy value when
+            the word has no geometry).
+
+    Returns:
+        A new image with a red outline around each word polygon and the
+        word's text drawn in green at the polygon's first point.
+    """
+    # Copy inside a context manager so the source file handle is released as
+    # soon as the pixel data has been duplicated.
+    with Image.open(image_path) as original_image:
+        annotated_image = original_image.copy()
+    draw = ImageDraw.Draw(annotated_image)
+
+    for word in words:
+        polygon = word['polygon']
+        if not polygon:
+            continue  # nothing to draw for a word without geometry
+
+        bbox = [(point.x, point.y) for point in polygon]
+        try:
+            # Drop every non-ASCII character before drawing the label.
+            # NOTE(review): presumably because the default font cannot
+            # render them -- confirm.
+            text_content = word['content'].encode('ascii', 'ignore').decode('ascii')
+        except Exception as e:
+            print(f"Error processing text {word['content']}: {e}")
+            text_content = "Error"
+        draw.polygon(bbox, outline="red")
+        draw.text(bbox[0], text_content, fill="green")
+
+    return annotated_image
+
+# Page dimensions used when laying out the PDF report.
+# NOTE(review): `letter` is not defined in the visible part of this module;
+# presumably reportlab's letter page size (612 x 792 points) -- confirm.
+page_width, page_height = letter
+
+def calculate_scale(image, max_width, max_height):
+    """Return the factor that fits `image` inside `max_width` x `max_height`.
+
+    The result is the smaller of the width ratio and the height ratio, so
+    scaling both image dimensions by it preserves the aspect ratio. A value
+    above 1 means the image would be enlarged.
+    """
+    return min(max_width / image.width, max_height / image.height)
+
+
+def get_text_density_map(pages):
+    """Count text lines per polygon centre.
+
+    For every line that has a polygon, the centre is the mean of its points'
+    x and y coordinates. The result maps each (x_center, y_center) tuple to
+    the number of lines whose centres coincide exactly; lines without a
+    polygon are ignored.
+    """
+    density_map = {}
+    polygons = (line.polygon for page in pages for line in page.lines)
+    for points in polygons:
+        if not points:
+            continue
+        point_count = len(points)
+        center = (
+            sum(point.x for point in points) / point_count,
+            sum(point.y for point in points) / point_count,
+        )
+        density_map[center] = density_map.get(center, 0) + 1
+    return density_map
+
+def find_highest_density_area(density_map):
+    """Return the key of `density_map` that has the largest count.
+
+    This is deliberately simple: it picks the single centre with the highest
+    count and does not consider the size or proximity of dense areas. Ties
+    are resolved by max() in favour of the key encountered first.
+    """
+    count_for = density_map.get
+    return max(density_map.keys(), key=count_for)
+
+def crop_image_to_text(image_path, density_center, crop_size):
+    """Crop a window of `crop_size` centred on `density_center`.
+
+    Args:
+        image_path: Path of the image to crop.
+        density_center: (x, y) centre of the crop window.
+        crop_size: (width, height) of the crop window.
+
+    Returns:
+        The cropped image. The window is clamped to the image bounds, so the
+        result can be smaller than `crop_size` near an edge.
+    """
+    with Image.open(image_path) as img:
+        center_x, center_y = density_center[0], density_center[1]
+        half_width = crop_size[0] // 2
+        half_height = crop_size[1] // 2
+
+        # Clamp each edge to the image.
+        left = max(center_x - half_width, 0)
+        upper = max(center_y - half_height, 0)
+        right = min(center_x + half_width, img.width)
+        lower = min(center_y + half_height, img.height)
+
+        # Debug output
+        print(f"Cropping coordinates: left={left}, upper={upper}, right={right}, lower={lower}")
+
+        return img.crop((left, upper, right, lower))
+
+
+def get_text_bounding_boxes(pages):
+    """Collect the polygon of every text line as a list of (x, y) tuples.
+
+    Lines without a polygon are skipped. The boxes are returned in page
+    order, then line order.
+    """
+    return [
+        [(point.x, point.y) for point in line.polygon]
+        for page in pages
+        for line in page.lines
+        if line.polygon
+    ]
+
+def combine_text_regions(image_path, bounding_boxes):
+    """Paste every text region of an image onto a blank white canvas.
+
+    The canvas spans the combined extent of all boxes; each region is copied
+    from the source image and pasted at the same position relative to that
+    extent.
+
+    Args:
+        image_path: Path of the source image.
+        bounding_boxes: Non-empty list of polygons, each a list of (x, y)
+            tuples.
+
+    Returns:
+        A new RGB image containing only the text regions.
+
+    Raises:
+        ValueError: If `bounding_boxes` is empty (no text regions to combine).
+    """
+    if not bounding_boxes:
+        raise ValueError("No text bounding boxes to combine")
+
+    # Axis-aligned extent (left, top, right, bottom) of each polygon,
+    # computed once and reused for both cropping and pasting.
+    extents = [
+        (
+            min(x for x, _ in box),
+            min(y for _, y in box),
+            max(x for x, _ in box),
+            max(y for _, y in box),
+        )
+        for box in bounding_boxes
+    ]
+
+    # Calculate the combined bounding box
+    min_x = min(extent[0] for extent in extents)
+    min_y = min(extent[1] for extent in extents)
+    max_x = max(extent[2] for extent in extents)
+    max_y = max(extent[3] for extent in extents)
+
+    # Create a new blank image with integer dimensions
+    combined_image = Image.new('RGB', (int(max_x - min_x), int(max_y - min_y)), (255, 255, 255))
+
+    with Image.open(image_path) as original_image:
+        for left, top, right, bottom in extents:
+            cropped_region = original_image.crop((left, top, right, bottom))
+            # Paste at the crop's own top-left corner. The polygon's first
+            # point is not necessarily that corner when the text is skewed.
+            combined_image.paste(cropped_region, (int(left - min_x), int(top - min_y)))
+
+    return combined_image
+
+
+
+
+
+
+###################################################
+def parse_document_content(content):
+    """Ask the OpenAI API to extract the species name from OCR text.
+
+    Args:
+        content: Text recognised on a specimen image.
+
+    Returns:
+        The model's reply with surrounding whitespace stripped, or None when
+        the request fails (the error is printed).
+    """
+    openai.api_key = 'your-api-key'
+
+    try:
+        # "gpt-4" is a chat model, so it has to be called through the chat
+        # completions endpoint (as extract_info() does); the legacy
+        # text-completions endpoint does not serve it.
+        response = openai.ChatCompletion.create(
+            model="gpt-4",
+            messages=[{
+                "role": "user",
+                "content": f"Extract specific information from the following text: {content}\n\nSpecies Name: ",
+            }],
+            max_tokens=100
+            # Add additional parameters as needed
+        )
+        parsed_data = response.choices[0].message['content'].strip()
+        return parsed_data
+    except Exception as e:
+        print("An error occurred:", e)
+        return None
+####################################################
+
+def analyze_text_density_and_crop(image_path):
+    """OCR an image and save a new image containing only its text regions.
+
+    Args:
+        image_path: Path of the image to analyse.
+
+    Returns:
+        Path of the saved combined image, 'combined_image.png' inside the
+        system temporary directory. The file is overwritten on every call.
+    """
+    import os
+    import tempfile
+
+    document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))
+
+    with open(image_path, "rb") as f:
+        image_stream = f.read()
+
+    poller = document_analysis_client.begin_analyze_document("prebuilt-read", image_stream)
+    result = poller.result()
+
+    # Get bounding boxes of text regions
+    bounding_boxes = get_text_bounding_boxes(result.pages)
+
+    # Combine the text regions into one image
+    combined_image = combine_text_regions(image_path, bounding_boxes)
+
+    # Use the platform's temp directory rather than a hard-coded '/tmp' so
+    # the function also works where that directory does not exist.
+    combined_image_path = os.path.join(tempfile.gettempdir(), 'combined_image.png')
+    combined_image.save(combined_image_path)
+    return combined_image_path
+
+
+def analyze_read(image_path, output_path, show_first_output=False):
+    """Crop an image to its text regions, OCR the crop and write a PDF report.
+
+    The report shows the original image, the annotated combined-text image
+    and the recognised words, one per line.
+
+    Args:
+        image_path: Path of the source image.
+        output_path: Directory that receives the generated PDF (it is not
+            created here).
+        show_first_output: Accepted for callers that pass it; the function
+            body does not use it.
+
+    Returns:
+        None. Errors raised after the cropping step are caught and printed;
+        errors raised by the cropping step itself propagate to the caller.
+    """
+    import tempfile  # local import: only needed for the scratch PNG below
+
+    combined_image_path = analyze_text_density_and_crop(image_path)
+
+    try:
+        # Process the combined image with Azure Form Recognizer
+        with open(combined_image_path, "rb") as f:
+            combined_image_stream = f.read()
+
+        document_analysis_client = DocumentAnalysisClient(
+            endpoint=endpoint, credential=AzureKeyCredential(key)
+        )
+
+        poller = document_analysis_client.begin_analyze_document(
+            "prebuilt-read", combined_image_stream)
+        result = poller.result()
+
+        # Collect words and their polygon data
+        words = []
+        for page in result.pages:
+            for word in page.words:
+                words.append({
+                    'content': word.content,
+                    'polygon': word.polygon
+                })
+
+        # Prepare annotated image
+        annotated_img = draw_boxes(combined_image_path, words)
+
+        # Build the output name from the stem so .png/.jpeg/.JPG inputs also
+        # end in .pdf; replacing the literal '.jpg' left those names untouched.
+        stem = os.path.splitext(os.path.basename(image_path))[0]
+        output_filename = os.path.join(output_path, sanitize_filename(stem + '.pdf'))
+        c = canvas.Canvas(output_filename, pagesize=letter)
+
+        # Draw original image, scaled to fit the page and anchored at the
+        # top-left corner. The handle is only needed for the size.
+        with Image.open(image_path) as original_image:
+            scale = calculate_scale(original_image, page_width, page_height)
+            img_width, img_height = original_image.width * scale, original_image.height * scale
+        c.drawImage(image_path, 0, page_height - img_height, width=img_width, height=img_height, mask='auto')
+        y_position = page_height - img_height
+
+        # The annotated image has to exist on disk for drawImage(); keep it in
+        # a private scratch directory that is removed once the PDF is saved.
+        with tempfile.TemporaryDirectory() as scratch_dir:
+            annotated_image_path = os.path.join(scratch_dir, 'annotated_image.png')
+            annotated_img.save(annotated_image_path)
+            scale = calculate_scale(annotated_img, page_width, page_height)
+            annotated_img_width, annotated_img_height = annotated_img.width * scale, annotated_img.height * scale
+            if y_position - annotated_img_height < 0:
+                c.showPage()  # Start a new page if not enough space
+                y_position = page_height
+            # Track the bottom edge of the annotated image so the text that
+            # follows starts below it instead of on top of it.
+            y_position -= annotated_img_height
+            c.drawImage(annotated_image_path, 0, y_position, width=annotated_img_width, height=annotated_img_height, mask='auto')
+
+            # Add text
+            textobject = c.beginText()
+            textobject.setTextOrigin(10, y_position - 15)
+            textobject.setFont("Times-Roman", 12)
+
+            document_content = '\n'.join([word['content'] for word in words])
+            for line in document_content.split('\n'):
+                if textobject.getY() - 15 < 0:  # Check if new page is needed for more text
+                    c.drawText(textobject)
+                    c.showPage()
+                    textobject = c.beginText()
+                    textobject.setTextOrigin(10, page_height - 15)
+                    textobject.setFont("Times-Roman", 12)
+                textobject.textLine(line)
+
+            c.drawText(textobject)
+            c.save()
+
+    except Exception as e:
+        print(f"An error occurred while processing {combined_image_path}: {e}")
+
+
+
+
+if __name__ == "__main__":
+    # Batch driver: run the crop-and-OCR pipeline on the first image found in
+    # the input folder and write its PDF report to the output folder.
+
+    input_folder = 'data/resized-images/'
+    output_folder = 'data/AzureVisioncrop/'
+    # Create the output folder if it doesn't exist
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+
+    image_counter = 0  # Initialize a counter for the number of images processed
+
+    # Iterate over each image in the input folder
+    for image_file in os.listdir(input_folder):
+        image_path = os.path.join(input_folder, image_file)
+
+        # Check if the file is an image
+        if image_path.lower().endswith(('.png', '.jpg', '.jpeg')):
+            # Derive the flag from the counter: it is True only for the first
+            # image. The name used here before was never defined and raised
+            # NameError on the first call.
+            analyze_read(image_path, output_folder, show_first_output=(image_counter == 0))
+            image_counter += 1  # Increment the counter
+
+        if image_counter >= 1:  # Stop after processing a single image
+            break
+
+
+
+
diff --git a/HerbariaOCR/LLM_Evaluation.py b/HerbariaOCR/LLM_Evaluation.py
new file mode 100644
index 0000000..ff7d092
--- /dev/null
+++ b/HerbariaOCR/LLM_Evaluation.py
@@ -0,0 +1,27 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_LLM_Evaluation.ipynb.
+
+# %% auto 0
+__all__ = ['word2vec', 'cosdis']
+
+# %% ../nbs/01_LLM_Evaluation.ipynb 49
+# utility functions for finding cosine similarity
+
+def word2vec(word):
+    """Build a character-count vector for `word`.
+
+    Returns:
+        A tuple (counts, chars, norm): a Counter of character occurrences,
+        the set of distinct characters, and the Euclidean norm of the counts.
+    """
+    from collections import Counter
+    from math import sqrt
+
+    char_counts = Counter(word)
+    distinct_chars = set(char_counts)
+    norm = sqrt(sum(count * count for count in char_counts.values()))
+
+    return char_counts, distinct_chars, norm
+
+def cosdis(v1, v2):
+    """Return the cosine similarity of two vectors produced by word2vec().
+
+    Despite the name, the value is a similarity: about 1.0 for identical
+    character distributions and 0.0 when no character is shared.
+
+    Args:
+        v1, v2: (counts, chars, norm) tuples.
+
+    Returns:
+        The similarity as a float. 0.0 is returned when either vector has a
+        zero norm (empty word) instead of dividing by zero.
+    """
+    if not v1[2] or not v2[2]:
+        return 0.0
+    # which characters are common to the two words?
+    common = v1[1].intersection(v2[1])
+    # dot product over the shared characters divided by both norms
+    return sum(v1[0][ch] * v2[0][ch] for ch in common) / v1[2] / v2[2]
diff --git a/HerbariaOCR/LLM_Evaluation_Cyrillic.py b/HerbariaOCR/LLM_Evaluation_Cyrillic.py
new file mode 100644
index 0000000..649f56b
--- /dev/null
+++ b/HerbariaOCR/LLM_Evaluation_Cyrillic.py
@@ -0,0 +1,28 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/03_LLM_Evaluation_Cyrilic.ipynb.
+
+# %% auto 0
+__all__ = ['word2vec', 'cosdis']
+
+# %% ../nbs/03_LLM_Evaluation_Cyrilic.ipynb 49
+# utility functions for finding cosine similarity
+
+def word2vec(word):
+    """Build a character-count vector for `word`.
+
+    Returns:
+        A tuple (counts, chars, norm): a Counter of character occurrences,
+        the set of distinct characters, and the Euclidean norm of the counts.
+    """
+    from collections import Counter
+    from math import sqrt
+
+    char_counts = Counter(word)
+    distinct_chars = set(char_counts)
+    norm = sqrt(sum(count * count for count in char_counts.values()))
+
+    return char_counts, distinct_chars, norm
+
+def cosdis(v1, v2):
+    """Return the cosine similarity of two vectors produced by word2vec().
+
+    Despite the name, the value is a similarity: about 1.0 for identical
+    character distributions and 0.0 when no character is shared.
+
+    Args:
+        v1, v2: (counts, chars, norm) tuples.
+
+    Returns:
+        The similarity as a float. 0.0 is returned when either vector has a
+        zero norm (empty word) instead of dividing by zero.
+    """
+    if not v1[2] or not v2[2]:
+        return 0.0
+    # which characters are common to the two words?
+    common = v1[1].intersection(v2[1])
+    # dot product over the shared characters divided by both norms
+    return sum(v1[0][ch] * v2[0][ch] for ch in common) / v1[2] / v2[2]
diff --git a/HerbariaOCR/_modidx.py b/HerbariaOCR/_modidx.py
index 450eb1b..5a64980 100644
--- a/HerbariaOCR/_modidx.py
+++ b/HerbariaOCR/_modidx.py
@@ -5,5 +5,41 @@
'doc_host': 'https://BU-Spark.github.io',
'git_url': 'https://github.com/BU-Spark/HerbariaOCR',
'lib_path': 'HerbariaOCR'},
- 'syms': { 'HerbariaOCR.core': { 'HerbariaOCR.core.foo': ('core.html#foo', 'HerbariaOCR/core.py'),
+ 'syms': { 'HerbariaOCR.AzureVision': { 'HerbariaOCR.AzureVision.analyze_read': ( 'azurevision.html#analyze_read',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.analyze_text_density_and_crop': ( 'azurevision.html#analyze_text_density_and_crop',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.calculate_scale': ( 'azurevision.html#calculate_scale',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.combine_text_regions': ( 'azurevision.html#combine_text_regions',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.crop_image_to_text': ( 'azurevision.html#crop_image_to_text',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.draw_boxes': ( 'azurevision.html#draw_boxes',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.extract_info': ( 'azurevision.html#extract_info',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.find_highest_density_area': ( 'azurevision.html#find_highest_density_area',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.format_bounding_box': ( 'azurevision.html#format_bounding_box',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.get_text_bounding_boxes': ( 'azurevision.html#get_text_bounding_boxes',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.get_text_density_map': ( 'azurevision.html#get_text_density_map',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.model_function': ( 'azurevision.html#model_function',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.parse_document_content': ( 'azurevision.html#parse_document_content',
+ 'HerbariaOCR/AzureVision.py'),
+ 'HerbariaOCR.AzureVision.sanitize_filename': ( 'azurevision.html#sanitize_filename',
+ 'HerbariaOCR/AzureVision.py')},
+ 'HerbariaOCR.LLM_Evaluation': { 'HerbariaOCR.LLM_Evaluation.cosdis': ( 'llm_evaluation.html#cosdis',
+ 'HerbariaOCR/LLM_Evaluation.py'),
+ 'HerbariaOCR.LLM_Evaluation.word2vec': ( 'llm_evaluation.html#word2vec',
+ 'HerbariaOCR/LLM_Evaluation.py')},
+ 'HerbariaOCR.LLM_Evaluation_Cyrillic': { 'HerbariaOCR.LLM_Evaluation_Cyrillic.cosdis': ( 'llm_evaluation_cyrilic.html#cosdis',
+ 'HerbariaOCR/LLM_Evaluation_Cyrillic.py'),
+ 'HerbariaOCR.LLM_Evaluation_Cyrillic.word2vec': ( 'llm_evaluation_cyrilic.html#word2vec',
+ 'HerbariaOCR/LLM_Evaluation_Cyrillic.py')},
+ 'HerbariaOCR.core': { 'HerbariaOCR.core.foo': ('core.html#foo', 'HerbariaOCR/core.py'),
'HerbariaOCR.core.say_hello': ('core.html#say_hello', 'HerbariaOCR/core.py')}}}
diff --git a/README.md b/README.md
index 20fe813..5e19e54 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
+### *Smriti Suresh, Dima Kazlouski, Douglas Moy - 2023-10-06 v1.0.0-dev*
+
Note: The file `README.md` is generated from `nbs/index.ipynb`. Do not
edit `README.md` directly, but rather edit `nbs/index.ipynb`. It is also
rendered as the front page to the project documentation.
@@ -9,6 +11,30 @@ rendered as the front page to the project documentation.
See [project documentation](https://bu-spark.github.io/HerbariaOCR/) for
the rendered documentation output.
+## Overview
+
+The changing climate increases stressors that weaken plant resilience,
+disrupting forest structure and ecosystem services. Rising temperatures
+lead to more frequent droughts, wildfires, and invasive pest outbreaks,
+leading to the loss of plant species. That has numerous detrimental
+effects, including lowered productivity, the spread of invasive plants,
+vulnerability to pests, altered ecosystem structure, etc. The project
+aims to aid climate scientists in capturing patterns in plant life
+concerning changing climate. The herbarium specimens are pressed plant
+samples stored on paper. The specimen labels are handwritten and date
+back to the early 1900s. The labels contain the curator’s name, their
+institution, the species and genus, and the date the specimen was
+collected. Since the labels are handwritten, they are not readily
+accessible from an analytical standpoint. The data, at this time, cannot
+be analyzed to study the impact of climate on plant life. The digitized
+samples are an invaluable source of information for climate change
+scientists, and are providing key insights into biodiversity change over
+the last century. Digitized specimens will facilitate easier
+dissemination of information and allow more people access to data. The
+project, if successful, would enable users from various domains in
+environmental science to further studies pertaining to climate change
+and its effects on flora and even fauna.
+
## Install
``` sh
@@ -17,13 +43,14 @@ pip install HerbariaOCR
## How to use
-Fill me in please! Don’t forget code examples:
+Refer to the EDA tab for exploring the Herbaria OCR data sources.
-``` python
-1+1
-```
+Refer to the Azure Vision tab for exploring the implementation of the
+OCR pipeline to obtain results from the models.
- 2
+Refer to the LLM Evaluations tabs for exploring the Evaluation metrics
+used as well as Accuracy results for English, Cyrillic and Chinese
+samples respectively.
## Contributing
diff --git a/nbs/.notest b/nbs/.notest
new file mode 100644
index 0000000..e69de29
diff --git a/nbs/00_AzureVision.ipynb b/nbs/00_AzureVision.ipynb
new file mode 100644
index 0000000..9de01ea
--- /dev/null
+++ b/nbs/00_AzureVision.ipynb
@@ -0,0 +1,1105 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "70e4f63d-d65f-45b9-8a89-a49dc0a2e0ba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| default_exp AzureVision"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2f17ba8c",
+ "metadata": {},
+ "source": [
+ "# Azure Vision Implementation - Dima "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd18180f",
+ "metadata": {},
+ "source": [
+ "This notebook utilizes Azure AI Document Intelligence Studio to extract text from a set of Herbarium specimens. There was a previous issue with high-quality images being too large for Azure to process; this has been fixed through the resize_image function in this notebook, which converts all images to 4 MB or less. We have now been able to resize all images in the /projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/TROCR_Training/goodfiles/ folder, the same folder the previous semester's group used for testing their TROCR models. \n",
+ "\n",
+ "Currently, the notebook takes an input image from /projectnb/sparkgrp/ml-herbarium-grp/fall2023/LLM_Specimens, runs it through Azure Vision, analyzes all text, and creates a PDF with the original image and an annotated image that has boxes around identified words and the predicted words written over the original text. Below the image, the entire identified text is printed along with the confidence score for each identified term. All of this is saved in /projectnb/sparkgrp/ml-herbarium-grp/fall2023/AzureVision-results.\n",
+ "\n",
+ "- Previously there was a text recognition issue with images that have both text and the plant itself; this has been resolved.\n",
+ "- Experimentation was conducted to see if a custom Azure AI model could be trained to extract Taxon, collector, date, and geography data; this proved to be inefficient and the quality was poor.\n",
+ "- As a result, the model will focus on extracting all the text from each image and using OpenAI's ChatGPT to process the text into a Darwin Core JSON format. \n",
+ "\n",
+ "We are also looking into validating our results. We are looking into the validation dataset used by the group that created the TROCR model and we are also considering creating our own validation set. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dff2d218",
+ "metadata": {},
+ "source": [
+ "For the sake of presentation, we are also looking into creating a simple, user-friendly demo app that will enable a user to input a Herbarium sample, press a button, and see the processed result: the Taxon, Collection Date, Collector Name, and the Geography. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bc1c7278",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| hide\n",
+ "#!pip install azure-ai-formrecognizer --pre\n",
+ "#!pip install opencv-python-headless matplotlib\n",
+ "#!pip install matplotlib pillow\n",
+ "#!pip install ipywidgets\n",
+ "#!pip install shapely\n",
+ "#!pip install openai\n",
+ "#!pip install reportlab"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "773366f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| hide\n",
+ "from PIL import Image\n",
+ "import os\n",
+ "from azure.core.credentials import AzureKeyCredential\n",
+ "from azure.ai.formrecognizer import DocumentAnalysisClient\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.image as mpimg\n",
+ "from PIL import Image, ImageDraw, ImageFont\n",
+ "import openai\n",
+ "import re\n",
+ "import os\n",
+ "from reportlab.lib.pagesizes import letter\n",
+ "from reportlab.pdfgen import canvas\n",
+ "from reportlab.lib.units import inch\n",
+ "# import gradio as gr\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "179830cd",
+ "metadata": {},
+ "source": [
+ "# Resizing images to a smaller size so the API accepts them"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3808c408-6b31-4e09-ac0b-96d0945b4ff9",
+ "metadata": {},
+ "source": [
+ "#| export\n",
+ "\n",
+ "def resize_image(input_path, output_path, max_size_mb, quality=85):\n",
+ "\n",
+ " \"\"\"\n",
+ " Resize the image found at input_path and save it to output_path.\n",
+ " The image is resized to be under max_size_mb megabytes.\n",
+ " \"\"\"\n",
+ " \n",
+ " # Load the image\n",
+ " with Image.open(input_path) as img:\n",
+ " # Calculate target size to maintain aspect ratio\n",
+ " ratio = img.width / img.height\n",
+ " target_width = int((max_size_mb * 1024 * 1024 * ratio) ** 0.5)\n",
+ " target_height = int(target_width / ratio)\n",
+ "\n",
+ " # Resize the image\n",
+ " resized_img = img.resize((target_width, target_height), Image.Resampling.LANCZOS)\n",
+ "\n",
+ " # Save the resized image\n",
+ " resized_img.save(output_path, quality=quality)\n",
+ "\n",
+ "input_folder = 'data/goodfiles/'\n",
+ "\n",
+ "output_folder = 'data/resized-images/'\n",
+ "\n",
+ "if not os.path.exists(output_folder):\n",
+ "\n",
+ " os.makedirs(output_folder)\n",
+ "\n",
+ "for file_name in os.listdir(input_folder):\n",
+ "\n",
+ " file_path = os.path.join(input_folder, file_name)\n",
+ "\n",
+ " output_file_path = os.path.join(output_folder, file_name)\n",
+ "\n",
+ " # Check if the file is an image\n",
+ "\n",
+ " if file_path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
+ " \n",
+ " resize_image(file_path, output_file_path, max_size_mb=4) \n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "58cebe57",
+ "metadata": {},
+ "source": [
+ "The code below sets up a connection to Azure Cognitive Services for document analysis, and includes functions for sanitizing filenames and formatting bounding boxes.\n",
+ "\n",
+ "It defines a function to annotate images with extracted text and bounding boxes, and another function to parse document content using GPT-4.\n",
+ "\n",
+ "The main function, analyze_read, reads images, extracts text using Azure, annotates these images, and creates a PDF report that includes both the original and annotated images, along with the extracted text.\n",
+ "\n",
+ "The results are saved in /projectnb/sparkgrp/ml-herbarium-grp/fall2023/AzureVision-results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9eef5e22",
+ "metadata": {},
+ "source": [
+ "# Extracting Relevant Info w/ GPT-4 Turbo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2ad5cf1a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "```json\n",
+ "{\n",
+ " \"occurrenceID\": \"00592128\",\n",
+ " \"scientificName\": \"Atriplex patula L. var. hastata Gray\",\n",
+ " \"recordedBy\": \"Kate Furbish\",\n",
+ " \"country\": \"USA\",\n",
+ " \"stateProvince\": \"Maine\",\n",
+ " \"county\": \"Washington\",\n",
+ " \"locality\": \"North Lubec\",\n",
+ " \"eventDate\": \"1902-09-13\",\n",
+ " \"institutionCode\": \"Harvard University Herbaria\",\n",
+ " \"catalogNumber\": \"00592128\",\n",
+ " \"basisOfRecord\": \"PreservedSpecimen\",\n",
+ " \"rightsHolder\": \"President and Fellows of Harvard College\"\n",
+ "}\n",
+ "```\n",
+ "Below is the JSON representation of the relevant Darwin Core fields extracted from the provided text. The Darwin Core standard has many possible fields, but I've put together the ones that can be inferred from the text you've supplied:\n",
+ "\n",
+ "```json\n",
+ "{\n",
+ " \"type\": \"SYNTYPE\",\n",
+ " \"scientificName\": \"Glaux maritima var. obtusifolia Fernald\",\n",
+ " \"acceptedNameUsage\": \"Glaux maritima L.\",\n",
+ " \"verbatimEventDate\": \"July 24, 1902\",\n",
+ " \"recordNumber\": \"00936578, 00936579\",\n",
+ " \"locality\": \"Brackish marsh, St. Andrews, BATHURST, GLOUCESTER COUNTY\",\n",
+ " \"stateProvince\": \"New Brunswick\",\n",
+ " \"country\": \"Canada\",\n",
+ " \"identifiedBy\": \"Walter T. Kittredge\",\n",
+ " \"dateIdentified\": \"2019\",\n",
+ " \"recordedBy\": \"E. F. Williams, M. L. Fernald\",\n",
+ " \"institutionCode\": \"Harvard University Herbaria\",\n",
+ " \"catalogNumber\": \"Not provided (use recordNumber instead)\",\n",
+ " \"publicationCitation\": \"Fernald, Rhodora 4: 215. 1902\",\n",
+ " \"basisOfRecord\": \"PreservedSpecimen\",\n",
+ " \"occurrenceID\": \"Not provided\",\n",
+ " \"higherGeography\": \"North America\",\n",
+ " \"coordinateUncertaintyInMeters\": \"Not provided\"\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "Please note that some fields usually found in Darwin Core records such as `occurrenceID` and an exact `catalogNumber` cannot be reliably derived from the text provided. Additionally, the common name and higher taxonomy are not specified in the text. Where confidence levels were low or information could not be precisely determined from the text (e.g., exact coordinates, altitude), the \"Not provided\" value is used to indicate missing data.\n",
+ "```json\n",
+ "{\n",
+ " \"occurrenceID\": \"3040830\",\n",
+ " \"institutionCode\": \"NATIONAL HERBARIUM UNITED STATES\",\n",
+ " \"collectionCode\": \"PLANTS OF ILLINOIS\",\n",
+ " \"catalogNumber\": \"04385325\",\n",
+ " \"scientificName\": \"Sporobolus clandestinus (Spreng) Hitche\",\n",
+ " \"country\": \"US\",\n",
+ " \"stateProvince\": \"Illinois\",\n",
+ " \"county\": \"Henderson\",\n",
+ " \"locality\": \"Sandy bluff of Mississippi river, north of Oquawka\",\n",
+ " \"eventDate\": \"1941-08-27\",\n",
+ " \"recordedBy\": \"D. E. AND M. S. EYLES\",\n",
+ " \"recordNumber\": \"389\",\n",
+ " \"decimalLatitude\": \"\",\n",
+ " \"decimalLongitude\": \"\",\n",
+ " \"coordinateUncertaintyInMeters\": \"\",\n",
+ " \"identifiedBy\": \"\",\n",
+ " \"dateIdentified\": \"\",\n",
+ " \"basisOfRecord\": \"PreservedSpecimen\",\n",
+ " \"individualCount\": \"\",\n",
+ " \"preparations\": \"\"\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "**Note**: \n",
+ "\n",
+ "- The `decimalLatitude` and `decimalLongitude` fields did not contain valid entries in the provided text that could be extracted with confidence. Hence, they were left empty.\n",
+ "- Similarly, no data was given for the `coordinateUncertaintyInMeters`, `identifiedBy`, `dateIdentified`, `individualCount`, and `preparations`. These fields must be populated with relevant data if available, or left empty/blanks as shown.\n",
+ "- The format of `eventDate` was standardized to an ISO 8601 format (YYYY-MM-DD).\n",
+ "- `basisOfRecord` was assumed to be \"PreservedSpecimen\" as the standard for physical specimens. If this is not the case, it should be adjusted accordingly based on the exact nature of the record.\n",
+ "\n",
+ "Please make sure to validate the specifics (e.g., precise latitude/longitude, identifier's credentials, date of identification, etc.) and fill in any missing or uncertain parts from the original data if necessary.\n",
+ "```json\n",
+ "{\n",
+ " \"occurrenceID\": \"03587160\",\n",
+ " \"institutionCode\": \"Smithsonian Institution\",\n",
+ " \"collectionCode\": \"United States National Herbarium\",\n",
+ " \"basisOfRecord\": \"PreservedSpecimen\",\n",
+ " \"scientificName\": \"Nasturtium palustre (L.) DC.\",\n",
+ " \"acceptedNameUsage\": \"Rorippa islandica (Oeder ex Murray) Borbás\",\n",
+ " \"family\": \"Brassicaceae\",\n",
+ " \"eventDate\": \"1965-05-22\",\n",
+ " \"year\": \"1965\",\n",
+ " \"month\": \"5\",\n",
+ " \"day\": \"22\",\n",
+ " \"country\": \"United States of America\",\n",
+ " \"stateProvince\": \"Nebraska\",\n",
+ " \"county\": \"Cedar Co.\",\n",
+ " \"recordedBy\": \"Fred Clements\",\n",
+ " \"identifiedBy\": \"Ronald L. Stuckey\",\n",
+ " \"catalogNumber\": \"2618\",\n",
+ " \"otherCatalogNumbers\": \"219017\",\n",
+ " \"decimalLatitude\": \"\",\n",
+ " \"decimalLongitude\": \"\",\n",
+ " \"coordinateUncertaintyInMeters\": \"\",\n",
+ " \"verbatimCoordinates\": \"\",\n",
+ " \"geodeticDatum\": \"\",\n",
+ " \"verbatimSRS\": \"\",\n",
+ " \"informationWithheld\": \"\",\n",
+ " \"dataGeneralizations\": \"\",\n",
+ " \"dynamicProperties\": \"\"\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "Note: Coordinates and some other specific details like uncertainty and datum were not included in the provided text, so corresponding fields are left blank. The accepted name usage is assumed as per the identifier's determination. The collector's full name appears to be 'Fred Clements' based on the different notations. The confidence values provided do not affect the Darwin Core formatted data unless it specifically impacts the interpretation of relevant fields.\n",
+ "```json\n",
+ "{\n",
+ " \"occurrenceID\": \"1080232\",\n",
+ " \"institutionCode\": \"University of Minnesota\",\n",
+ " \"collectionCode\": \"Herbarium\",\n",
+ " \"recordedBy\": \"Sharon S. & Arthur O. Tucker\",\n",
+ " \"recordNumber\": \"181466\",\n",
+ " \"year\": \"1993\",\n",
+ " \"scientificName\": \"Blephilia hirsuta (Pursh) Benth.\",\n",
+ " \"locality\": \"Houston Co.\",\n",
+ " \"eventDate\": \"1899-07-11\",\n",
+ " \"country\": \"USA\",\n",
+ " \"stateProvince\": \"Minnesota\",\n",
+ " \"county\": \"Houston\",\n",
+ " \"catalogNumber\": \"181466\",\n",
+ " \"identifiedBy\": \"H.S. Lyon\"\n",
+ "}\n",
+ "```\n"
+ ]
+ }
+ ],
+ "source": [
+ "#| export\n",
+ "# Azure Cognitive Services endpoint and key\n",
+ "endpoint = \"https://herbariumsamplerecognition.cognitiveservices.azure.com/\"\n",
+ "key = \"your key here\"\n",
+ "\n",
+ "def sanitize_filename(filename):\n",
+ " # Remove characters that are not alphanumeric, spaces, dots, or underscores\n",
+ " return re.sub(r'[^\\w\\s\\.-]', '', filename)\n",
+ "\n",
+ "def format_bounding_box(bounding_box):\n",
+ " if not bounding_box:\n",
+ " return \"N/A\"\n",
+ " return \", \".join([\"[{}, {}]\".format(p.x, p.y) for p in bounding_box])\n",
+ "\n",
+ "\n",
+ "def draw_boxes(image_path, words):\n",
+ " original_image = Image.open(image_path)\n",
+ " annotated_image = original_image.copy()\n",
+ " draw = ImageDraw.Draw(annotated_image)\n",
+ "\n",
+ " for word in words:\n",
+ " polygon = word['polygon']\n",
+ " if polygon:\n",
+ " bbox = [(point.x, point.y) for point in polygon]\n",
+ " try:\n",
+ " # Replace special characters that cannot be encoded in 'latin-1'\n",
+ " text_content = word['content'].encode('ascii', 'ignore').decode('ascii')\n",
+ " except Exception as e:\n",
+ " print(f\"Error processing text {word['content']}: {e}\")\n",
+ " text_content = \"Error\"\n",
+ " draw.polygon(bbox, outline=\"red\")\n",
+ " draw.text((bbox[0][0], bbox[0][1]), text_content, fill=\"green\")\n",
+ "\n",
+ " \n",
+ " return annotated_image\n",
+ "\n",
+ "page_width, page_height = letter \n",
+ "\n",
+ "# Function to calculate scale to fit the image within page dimensions\n",
+ "def calculate_scale(image, max_width, max_height):\n",
+ " scale_w = max_width / image.width\n",
+ " scale_h = max_height / image.height\n",
+ " return min(scale_w, scale_h)\n",
+ "\n",
+ "\n",
+ "def extract_info(text):\n",
+ " # Set your OpenAI API key\n",
+ " openai.api_key = 'your key here'\n",
+ "\n",
+ " # Prepare the prompt for the API\n",
+ " prompt = f\"From the provided text, return only the relevant information in a JSON format according to the Darwin Core standard for biodiversity specimen:\\n{text}\"\n",
+ "\n",
+ " try:\n",
+ " # Send the request to the API\n",
+ " response = openai.ChatCompletion.create(\n",
+ " model=\"gpt-4-1106-preview\",\n",
+ " messages=[{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, \n",
+ " {\"role\": \"user\", \"content\": prompt}]\n",
+ " )\n",
+ "\n",
+ " # Extract the response\n",
+ " if response.choices:\n",
+ " return response.choices[0].message['content']\n",
+ " else:\n",
+ " return \"No response from the API.\"\n",
+ "\n",
+ " except Exception as e:\n",
+ " return f\"An error occurred: {str(e)}\"\n",
+ "\n",
+ "\n",
+ "def analyze_read(image_path, output_path):\n",
+ " try:\n",
+ " with open(image_path, \"rb\") as f:\n",
+ " image_stream = f.read()\n",
+ "\n",
+ " document_analysis_client = DocumentAnalysisClient(\n",
+ " endpoint=endpoint, credential=AzureKeyCredential(key)\n",
+ " )\n",
+ "\n",
+ " poller = document_analysis_client.begin_analyze_document(\n",
+ " \"prebuilt-read\", image_stream)\n",
+ " result = poller.result()\n",
+ "\n",
+ " # Collect words, their polygon data, and confidence\n",
+ " words = []\n",
+ " confidence_text = \"\"\n",
+ " for page in result.pages:\n",
+ " for word in page.words:\n",
+ " words.append({\n",
+ " 'content': word.content,\n",
+ " 'polygon': word.polygon\n",
+ " })\n",
+ " confidence_text += \"'{}' confidence {}\\n\".format(word.content, word.confidence)\n",
+ "\n",
+ " document_content = result.content + \"\\n\\nConfidence Metrics:\\n\" + confidence_text\n",
+ " extracted_info = extract_info(document_content)\n",
+ " print(extracted_info)\n",
+ " original_image = Image.open(image_path)\n",
+ " annotated_img = draw_boxes(image_path, words)\n",
+ " \n",
+ "\n",
+ " # Set up PDF\n",
+ " output_filename = os.path.join(output_path, sanitize_filename(os.path.basename(image_path).replace('.jpg', '.pdf')))\n",
+ " c = canvas.Canvas(output_filename, pagesize=letter)\n",
+ " width, height = letter # usually 612 x 792\n",
+ "\n",
+ " # Draw original image\n",
+ " if original_image.height <= height:\n",
+ " c.drawImage(image_path, 0, height - original_image.height, width=original_image.width, height=original_image.height, mask='auto')\n",
+ " y_position = height - original_image.height\n",
+ " else:\n",
+ " # Handle large images or add scaling logic here\n",
+ " pass\n",
+ "\n",
+ " \n",
+ " # Draw original image\n",
+ " scale = calculate_scale(original_image, page_width, page_height)\n",
+ " img_width, img_height = original_image.width * scale, original_image.height * scale\n",
+ " c.drawImage(image_path, 0, page_height - img_height, width=img_width, height=img_height, mask='auto')\n",
+ " y_position = page_height - img_height\n",
+ "\n",
+ " # Draw annotated image\n",
+ " annotated_image_path = '/tmp/annotated_image.png'\n",
+ " annotated_img.save(annotated_image_path)\n",
+ " scale = calculate_scale(annotated_img, page_width, page_height)\n",
+ " annotated_img_width, annotated_img_height = annotated_img.width * scale, annotated_img.height * scale\n",
+ " if y_position - annotated_img_height >= 0:\n",
+ " c.drawImage(annotated_image_path, 0, y_position - annotated_img_height, width=annotated_img_width, height=annotated_img_height, mask='auto')\n",
+ " else:\n",
+ " c.showPage()\n",
+ " c.drawImage(annotated_image_path, 0, page_height - annotated_img_height, width=annotated_img_width, height=annotated_img_height, mask='auto')\n",
+ " \n",
+ "\n",
+ " # Add text\n",
+ " textobject = c.beginText()\n",
+ " textobject.setTextOrigin(10, y_position - 15)\n",
+ " textobject.setFont(\"Times-Roman\", 12)\n",
+ "\n",
+ " for line in document_content.split('\\n'):\n",
+ " if textobject.getY() - 15 < 0: # Check if new page is needed for more text\n",
+ " c.drawText(textobject)\n",
+ " c.showPage()\n",
+ " textobject = c.beginText()\n",
+ " textobject.setTextOrigin(10, height - 15)\n",
+ " textobject.setFont(\"Times-Roman\", 12)\n",
+ " textobject.textLine(line)\n",
+ " \n",
+ " c.drawText(textobject)\n",
+ " c.save()\n",
+ "\n",
+ " except Exception as e:\n",
+ " print(f\"An error occurred while processing {image_path}: {e}\")\n",
+ "\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " \n",
+ " input_folder = 'data/resized-images/'\n",
+ " output_folder = 'data/temp/'\n",
+ " # Create the output folder if it doesn't exist\n",
+ " if not os.path.exists(output_folder):\n",
+ " os.makedirs(output_folder)\n",
+ "\n",
+ " image_counter = 0 # Initialize a counter for the number of images processed\n",
+ "\n",
+ " # Iterate over each image in the input folder\n",
+ " for image_file in os.listdir(input_folder):\n",
+ " image_path = os.path.join(input_folder, image_file)\n",
+ " \n",
+ " # Check if the file is an image\n",
+ " if image_path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
+ " analyze_read(image_path, output_folder)\n",
+ " image_counter += 1 # Increment the counter\n",
+ "\n",
+ " if image_counter >= 5: # Stop after processing x images\n",
+ " break\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bb317d2a",
+ "metadata": {},
+ "source": [
+ "# Saving as a .txt instead of a PDF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b6488535",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| export\n",
+ "# Azure Cognitive Services endpoint and key\n",
+ "endpoint = \"https://herbariumsamplerecognition.cognitiveservices.azure.com/\"\n",
+ "key = \"your key here\"\n",
+ "\n",
+ "def sanitize_filename(filename):\n",
+ " # Remove characters that are not alphanumeric, spaces, dots, or underscores\n",
+ " return re.sub(r'[^\\w\\s\\.-]', '', filename)\n",
+ "\n",
+ "def extract_info(text):\n",
+ " # Set your OpenAI API key\n",
+ " openai.api_key = 'your key here'\n",
+ "\n",
+ " # Prepare the prompt for the API\n",
+ " prompt = f\"From the provided text, return only the relevant information in a JSON format according to the Darwin Core standard for biodiversity specimen. Note: make sure that each output has a 'country' field. If you do not find an explicit country, make your best guess at the country using the context of the other text.\\n{text}\\n{text}\"\n",
+ "\n",
+ " try:\n",
+ " # Send the request to the API\n",
+ " response = openai.ChatCompletion.create(\n",
+ " model=\"gpt-4-1106-preview\",\n",
+ " messages=[{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, \n",
+ " {\"role\": \"user\", \"content\": prompt}]\n",
+ " )\n",
+ "\n",
+ " # Extract the response\n",
+ " return response.choices[0].message['content'] if response.choices else \"No response from the API.\"\n",
+ "\n",
+ " except Exception as e:\n",
+ " return f\"An error occurred: {str(e)}\"\n",
+ "\n",
+ "def analyze_read(image_path, output_path):\n",
+ " try:\n",
+ " with open(image_path, \"rb\") as f:\n",
+ " image_stream = f.read()\n",
+ "\n",
+ " document_analysis_client = DocumentAnalysisClient(\n",
+ " endpoint=endpoint, credential=AzureKeyCredential(key)\n",
+ " )\n",
+ "\n",
+ " poller = document_analysis_client.begin_analyze_document(\n",
+ " \"prebuilt-read\", image_stream)\n",
+ " result = poller.result()\n",
+ "\n",
+ " # Collect the content from the document\n",
+ " document_content = result.content\n",
+ " extracted_info = extract_info(document_content)\n",
+ "\n",
+ " # Save the extracted information to a text file\n",
+ " output_filename = os.path.join(output_path, sanitize_filename(os.path.basename(image_path).replace('.jpg', '.txt')))\n",
+ " with open(output_filename, 'w') as text_file:\n",
+ " text_file.write(extracted_info)\n",
+ "\n",
+ " except Exception as e:\n",
+ " print(f\"An error occurred while processing {image_path}: {e}\")\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " \n",
+ " input_folder = 'data/resized-images/'\n",
+ " output_folder = 'data/AzureVisionResults/'\n",
+ " # Create the output folder if it doesn't exist\n",
+ " if not os.path.exists(output_folder):\n",
+ " os.makedirs(output_folder)\n",
+ "\n",
+ " image_counter = 0 # Initialize a counter for the number of images processed\n",
+ "\n",
+ " # Iterate over each image in the input folder\n",
+ " for image_file in os.listdir(input_folder):\n",
+ " image_path = os.path.join(input_folder, image_file)\n",
+ " \n",
+ " # Check if the file is an image\n",
+ " if image_path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
+ " analyze_read(image_path, output_folder)\n",
+ " image_counter += 1 # Increment the counter\n",
+ "\n",
+ " if image_counter >= 224: # Stop after processing x images\n",
+ " break\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e8f69d7b",
+ "metadata": {},
+ "source": [
+ "# Making A UI"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b34619cc",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "4.8.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "#| hide\n",
+ "#!pip install --upgrade gradio\n",
+ "print(gr.__version__)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bccbd17e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running on local URL: http://127.0.0.1:7866\n",
+ "Running on public URL: https://2f755593139e8ed224.gradio.live\n",
+ "\n",
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": []
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#| export\n",
+ "# Azure Cognitive Services endpoint and key\n",
+ "endpoint = \"https://herbariumsamplerecognition.cognitiveservices.azure.com/\"\n",
+ "key = \"your key here\"\n",
+ "\n",
+ "\n",
+ "def sanitize_filename(filename):\n",
+ " # Remove characters that are not alphanumeric, spaces, dots, or underscores\n",
+ " return re.sub(r'[^\\w\\s\\.-]', '', filename)\n",
+ "\n",
+ "def extract_info(text):\n",
+ " # Set your OpenAI API key\n",
+ " openai.api_key = 'your key here'\n",
+ "\n",
+ " # Prepare the prompt for the API\n",
+ " prompt = f\"From the provided text, return only the relevant information in a JSON format according to the Darwin Core standard for biodiversity specimen. Note: make sure that each output has a 'country' field. If you do not find an explicit country, make your best guess at the country using the context of the other text.\\n{text}\\n{text}\"\n",
+ "\n",
+ " try:\n",
+ " # Send the request to the API\n",
+ " response = openai.ChatCompletion.create(\n",
+ " model=\"gpt-4-1106-preview\",\n",
+ " messages=[{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, \n",
+ " {\"role\": \"user\", \"content\": prompt}]\n",
+ " )\n",
+ "\n",
+ " # Extract the response\n",
+ " return response.choices[0].message['content'] if response.choices else \"No response from the API.\"\n",
+ "\n",
+ " except Exception as e:\n",
+ " return f\"An error occurred: {str(e)}\"\n",
+ "\n",
+ "def analyze_read(image_stream):\n",
+ " try:\n",
+ " document_analysis_client = DocumentAnalysisClient(\n",
+ " endpoint=endpoint, credential=AzureKeyCredential(key)\n",
+ " )\n",
+ "\n",
+ " poller = document_analysis_client.begin_analyze_document(\n",
+ " \"prebuilt-read\", image_stream)\n",
+ " result = poller.result()\n",
+ "\n",
+ " # Collect the content from the document\n",
+ " document_content = result.content\n",
+ " extracted_info = extract_info(document_content)\n",
+ "\n",
+ " return extracted_info\n",
+ "\n",
+ " except Exception as e:\n",
+ " return f\"An error occurred: {str(e)}\"\n",
+ "\n",
+ "def model_function(image):\n",
+ " # Convert the NumPy array to a PIL Image object\n",
+ " image = Image.fromarray(np.uint8(image)).convert('RGB')\n",
+ "\n",
+ " # Convert the uploaded image to a byte stream\n",
+ " image_bytes = io.BytesIO()\n",
+ " image.save(image_bytes, format='JPEG') # Using 'JPEG' as the format\n",
+ " image_bytes = image_bytes.getvalue()\n",
+ "\n",
+ " output_text = analyze_read(image_bytes)\n",
+ " return output_text\n",
+ "\n",
+ "# iface = gr.Interface(fn=model_function, inputs=\"image\", outputs=\"text\")\n",
+ "# iface.launch(share=True)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e791a84b",
+ "metadata": {},
+ "source": [
+ "# Experimenting With Cropping Images\n",
+ "\n",
+ "So far, results have been better for images that have not been cropped (the cropping function needs improvement)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0998c751",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| export\n",
+ "# Azure Cognitive Services endpoint and key\n",
+ "endpoint = \"https://herbariumsamplerecognition.cognitiveservices.azure.com/\"\n",
+ "key = \"AZURE KEY HERE\"\n",
+ "\n",
+ "def sanitize_filename(filename):\n",
+ " # Remove characters that are not alphanumeric, spaces, dots, or underscores\n",
+ " return re.sub(r'[^\\w\\s\\.-]', '', filename)\n",
+ "\n",
+ "def format_bounding_box(bounding_box):\n",
+ " if not bounding_box:\n",
+ " return \"N/A\"\n",
+ " return \", \".join([\"[{}, {}]\".format(p.x, p.y) for p in bounding_box])\n",
+ "\n",
+ "def draw_boxes(image_path, words):\n",
+ " original_image = Image.open(image_path)\n",
+ " annotated_image = original_image.copy()\n",
+ " draw = ImageDraw.Draw(annotated_image)\n",
+ "\n",
+ " for word in words:\n",
+ " polygon = word['polygon']\n",
+ " if polygon:\n",
+ " bbox = [(point.x, point.y) for point in polygon]\n",
+ " try:\n",
+ " # Replace special characters that cannot be encoded in 'latin-1'\n",
+ " text_content = word['content'].encode('ascii', 'ignore').decode('ascii')\n",
+ " except Exception as e:\n",
+ " print(f\"Error processing text {word['content']}: {e}\")\n",
+ " text_content = \"Error\"\n",
+ " draw.polygon(bbox, outline=\"red\")\n",
+ " draw.text((bbox[0][0], bbox[0][1]), text_content, fill=\"green\")\n",
+ " \n",
+ " return annotated_image\n",
+ "\n",
+ "page_width, page_height = letter \n",
+ "\n",
+ "# Function to calculate scale to fit the image within page dimensions\n",
+ "def calculate_scale(image, max_width, max_height):\n",
+ " scale_w = max_width / image.width\n",
+ " scale_h = max_height / image.height\n",
+ " return min(scale_w, scale_h)\n",
+ "\n",
+ "\n",
+ "def get_text_density_map(pages):\n",
+ " density_map = {}\n",
+ " for page in pages:\n",
+ " for line in page.lines:\n",
+ " points = line.polygon\n",
+ " if points:\n",
+ " x_center = sum(point.x for point in points) / len(points)\n",
+ " y_center = sum(point.y for point in points) / len(points)\n",
+ " density_map[(x_center, y_center)] = density_map.get((x_center, y_center), 0) + 1\n",
+ " return density_map\n",
+ "\n",
+ "def find_highest_density_area(density_map):\n",
+ " # This function will find the center of the area with the highest text density\n",
+ " # For simplicity, this example just returns the center with the highest count\n",
+ " # In a real scenario, you might want to consider a more sophisticated method\n",
+ " # that takes into account the size and proximity of the high-density areas\n",
+ " return max(density_map, key=density_map.get)\n",
+ "\n",
+ "def crop_image_to_text(image_path, density_center, crop_size):\n",
+ " with Image.open(image_path) as img:\n",
+ " # Calculate the coordinates for the crop\n",
+ " left = max(density_center[0] - crop_size[0] // 2, 0)\n",
+ " upper = max(density_center[1] - crop_size[1] // 2, 0)\n",
+ " right = min(density_center[0] + crop_size[0] // 2, img.width)\n",
+ " lower = min(density_center[1] + crop_size[1] // 2, img.height)\n",
+ "\n",
+ " # Debug output\n",
+ " print(f\"Cropping coordinates: left={left}, upper={upper}, right={right}, lower={lower}\")\n",
+ "\n",
+ " # Perform the crop\n",
+ " cropped_img = img.crop((left, upper, right, lower))\n",
+ " return cropped_img\n",
+ " \n",
+ " \n",
+ "def get_text_bounding_boxes(pages):\n",
+ " bounding_boxes = []\n",
+ " for page in pages:\n",
+ " for line in page.lines:\n",
+ " if line.polygon:\n",
+ " box = [(point.x, point.y) for point in line.polygon]\n",
+ " bounding_boxes.append(box)\n",
+ " return bounding_boxes\n",
+ "\n",
+ "def combine_text_regions(image_path, bounding_boxes):\n",
+ " original_image = Image.open(image_path)\n",
+ "\n",
+ " # Calculate the combined bounding box\n",
+ " min_x = min(min(box, key=lambda x: x[0])[0] for box in bounding_boxes)\n",
+ " min_y = min(min(box, key=lambda x: x[1])[1] for box in bounding_boxes)\n",
+ " max_x = max(max(box, key=lambda x: x[0])[0] for box in bounding_boxes)\n",
+ " max_y = max(max(box, key=lambda x: x[1])[1] for box in bounding_boxes)\n",
+ "\n",
+ " # Create a new blank image with integer dimensions\n",
+ " combined_image = Image.new('RGB', (int(max_x - min_x), int(max_y - min_y)), (255, 255, 255))\n",
+ " \n",
+ " for box in bounding_boxes:\n",
+ " cropped_region = original_image.crop((min(box, key=lambda x: x[0])[0], \n",
+ " min(box, key=lambda x: x[1])[1], \n",
+ " max(box, key=lambda x: x[0])[0], \n",
+ " max(box, key=lambda x: x[1])[1]))\n",
+ " # Paste the cropped region at integer coordinates\n",
+ " combined_image.paste(cropped_region, (int(box[0][0] - min_x), int(box[0][1] - min_y)))\n",
+ "\n",
+ " return combined_image\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "###################################################\n",
+ "def parse_document_content(content):\n",
+ " openai.api_key = 'your-api-key'\n",
+ "\n",
+ " try:\n",
+ " response = openai.Completion.create(\n",
+ " model=\"gpt-4\",\n",
+ " prompt=f\"Extract specific information from the following text: {content}\\n\\nSpecies Name: \",\n",
+ " max_tokens=100\n",
+ " # Add additional parameters as needed\n",
+ " )\n",
+ " parsed_data = response.choices[0].text.strip()\n",
+ " return parsed_data\n",
+ " except Exception as e:\n",
+ " print(\"An error occurred:\", e)\n",
+ " return None\n",
+ "####################################################\n",
+ "\n",
+ "def analyze_text_density_and_crop(image_path):\n",
+ " document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))\n",
+ " \n",
+ " with open(image_path, \"rb\") as f:\n",
+ " image_stream = f.read()\n",
+ "\n",
+ " poller = document_analysis_client.begin_analyze_document(\"prebuilt-read\", image_stream)\n",
+ " result = poller.result()\n",
+ "\n",
+ " # Get bounding boxes of text regions\n",
+ " bounding_boxes = get_text_bounding_boxes(result.pages)\n",
+ "\n",
+ " # Combine the text regions into one image\n",
+ " combined_image = combine_text_regions(image_path, bounding_boxes)\n",
+ "\n",
+ " # Save the combined image temporarily and return its path\n",
+ " combined_image_path = '/tmp/combined_image.png'\n",
+ " combined_image.save(combined_image_path)\n",
+ " return combined_image_path\n",
+ "\n",
+ "\n",
+ "def analyze_read(image_path, output_path, show_first_output=False):\n",
+ " combined_image_path = analyze_text_density_and_crop(image_path)\n",
+ "\n",
+ " try:\n",
+ " # Process the combined image with Azure Form Recognizer\n",
+ " with open(combined_image_path, \"rb\") as f:\n",
+ " combined_image_stream = f.read()\n",
+ "\n",
+ " document_analysis_client = DocumentAnalysisClient(\n",
+ " endpoint=endpoint, credential=AzureKeyCredential(key)\n",
+ " )\n",
+ "\n",
+ " poller = document_analysis_client.begin_analyze_document(\n",
+ " \"prebuilt-read\", combined_image_stream)\n",
+ " result = poller.result()\n",
+ "\n",
+ " # Collect words and their polygon data\n",
+ " words = []\n",
+ " for page in result.pages:\n",
+ " for word in page.words:\n",
+ " words.append({\n",
+ " 'content': word.content,\n",
+ " 'polygon': word.polygon\n",
+ " })\n",
+ "\n",
+ " # Prepare annotated image\n",
+ " annotated_img = draw_boxes(combined_image_path, words)\n",
+ "\n",
+ " # Set up PDF\n",
+ " output_filename = os.path.join(output_path, sanitize_filename(os.path.basename(image_path).replace('.jpg', '.pdf')))\n",
+ " c = canvas.Canvas(output_filename, pagesize=letter)\n",
+ "\n",
+ " # Draw original image\n",
+ " original_image = Image.open(image_path)\n",
+ " scale = calculate_scale(original_image, page_width, page_height)\n",
+ " img_width, img_height = original_image.width * scale, original_image.height * scale\n",
+ " c.drawImage(image_path, 0, page_height - img_height, width=img_width, height=img_height, mask='auto')\n",
+ " y_position = page_height - img_height\n",
+ "\n",
+ " # Draw annotated combined image\n",
+ " annotated_image_path = '/tmp/annotated_image.png'\n",
+ " annotated_img.save(annotated_image_path)\n",
+ " scale = calculate_scale(annotated_img, page_width, page_height)\n",
+ " annotated_img_width, annotated_img_height = annotated_img.width * scale, annotated_img.height * scale\n",
+ " if y_position - annotated_img_height >= 0:\n",
+ " c.drawImage(annotated_image_path, 0, y_position - annotated_img_height, width=annotated_img_width, height=annotated_img_height, mask='auto')\n",
+ " else:\n",
+ " c.showPage() # Start a new page if not enough space\n",
+ " c.drawImage(annotated_image_path, 0, page_height - annotated_img_height, width=annotated_img_width, height=annotated_img_height, mask='auto')\n",
+ "\n",
+ " # Add text\n",
+ " textobject = c.beginText()\n",
+ " textobject.setTextOrigin(10, y_position - 15)\n",
+ " textobject.setFont(\"Times-Roman\", 12)\n",
+ "\n",
+ " document_content = '\\n'.join([word['content'] for word in words])\n",
+ " for line in document_content.split('\\n'):\n",
+ " if textobject.getY() - 15 < 0: # Check if new page is needed for more text\n",
+ " c.drawText(textobject)\n",
+ " c.showPage()\n",
+ " textobject = c.beginText()\n",
+ " textobject.setTextOrigin(10, page_height - 15)\n",
+ " textobject.setFont(\"Times-Roman\", 12)\n",
+ " textobject.textLine(line)\n",
+ "\n",
+ " c.drawText(textobject)\n",
+ " c.save()\n",
+ "\n",
+ " except Exception as e:\n",
+ " print(f\"An error occurred while processing {combined_image_path}: {e}\")\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " \n",
+ " input_folder = 'data/resized-images/'\n",
+ " output_folder = 'data/AzureVisioncrop/'\n",
+ " # Create the output folder if it doesn't exist\n",
+ " if not os.path.exists(output_folder):\n",
+ " os.makedirs(output_folder)\n",
+ "\n",
+ " image_counter = 0 # Initialize a counter for the number of images processed\n",
+ "\n",
+ " # Iterate over each image in the input folder\n",
+ " for image_file in os.listdir(input_folder):\n",
+ " image_path = os.path.join(input_folder, image_file)\n",
+ " \n",
+ " # Check if the file is an image\n",
+ " if image_path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
+ " analyze_read(image_path, output_folder, show_first_output=not first_output_shown)\n",
+ " image_counter += 1 # Increment the counter\n",
+ "\n",
+ " if image_counter >= 1: # Stop after processing 1 image\n",
+ " break\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "95025fa1",
+ "metadata": {},
+ "source": [
+ "# Running Model On Cyrillic & Chinese Images"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "75632e6f-35cd-4bc6-889b-dc2628d1c7ad",
+ "metadata": {},
+ "source": [
+ "Azure Cognitive Services endpoint and key\n",
+ "\n",
+ "endpoint = \"https://herbariumsamplerecognition.cognitiveservices.azure.com/\"\n",
+ "\n",
+ "key = \"your key here\"\n",
+ "\n",
+ "def sanitize_filename(filename):\n",
+ "\n",
+ " # Remove characters that are not alphanumeric, spaces, dots, hyphens, or underscores\n",
+ "\n",
+ " return re.sub(r'[^\\w\\s\\.-]', '', filename)\n",
+ "\n",
+ "def extract_info(text):\n",
+ "\n",
+ " # Set your OpenAI API key\n",
+ "\n",
+ " openai.api_key = 'your key here'\n",
+ "\n",
+ " # Prepare the prompt for the API\n",
+ "\n",
+ " #prompt = f\"From the provided text, return only the relevant information in a JSON format according to the Darwin Core standard for biodiversity specimen. NOTE: Parts or the majority of the textmay be in Cyrillic. Take this into consideration. Additionally, there should be a 'country' field, if you cannot determinethe country directly, to your best ability, infer what the country is:\\n{text}\"\n",
+ " prompt = f\"From the provided text, return only the relevant information in a JSON format according to the Darwin Core standard for biodiversity specimen. NOTE: Parts or the majority of the textmay be in Chinese, Take this into consideration. Additionally, there should be a 'country' field, if you cannot determinethe country directly, to your best ability, infer what the country is. Lastly, the more info the better, output chinese character as appropriate:\\n{text}\"\n",
+ "\n",
+ " try:\n",
+ " # Send the request to the API\n",
+ "\n",
+ " response = openai.ChatCompletion.create(\n",
+ "\n",
+ " model=\"gpt-4-1106-preview\",\n",
+ "\n",
+ " messages=[{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, \n",
+ "\n",
+ " {\"role\": \"user\", \"content\": prompt}]\n",
+ " )\n",
+ "\n",
+ " # Extract the response\n",
+ "\n",
+ " return response.choices[0].message['content'] if response.choices else \"No response from the API.\"\n",
+ "\n",
+ " except Exception as e:\n",
+ "\n",
+ " return f\"An error occurred: {str(e)}\"\n",
+ "\n",
+ "def analyze_read(image_path, output_path):\n",
+ "\n",
+ " try:\n",
+ "\n",
+ " with open(image_path, \"rb\") as f:\n",
+ " image_stream = f.read()\n",
+ "\n",
+ " document_analysis_client = DocumentAnalysisClient(\n",
+ " endpoint=endpoint, credential=AzureKeyCredential(key)\n",
+ " )\n",
+ "\n",
+ " poller = document_analysis_client.begin_analyze_document(\n",
+ " \"prebuilt-read\", image_stream)\n",
+ " result = poller.result()\n",
+ "\n",
+ " # Collect the content from the document\n",
+ "\n",
+ " document_content = result.content\n",
+ " extracted_info = extract_info(document_content)\n",
+ " \n",
+ " #print(extracted_info)\n",
+ "\n",
+ "\n",
+ " # Save the extracted information to a text file\n",
+ "\n",
+ " output_filename = os.path.join(output_path, sanitize_filename(os.path.basename(image_path).replace('.jpg', '.txt')))\n",
+ " with open(output_filename, 'w') as text_file:\n",
+ " text_file.write(extracted_info)\n",
+ "\n",
+ " except Exception as e:\n",
+ " print(f\"An error occurred while processing {image_path}: {e}\")\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " \n",
+ " input_folder = '/data/CyrillicImages/'\n",
+ "\n",
+ " output_folder = '/data/CyrillicResults/'\n",
+ "\n",
+ " # Create the output folder if it doesn't exist\n",
+ "\n",
+ " if not os.path.exists(output_folder):\n",
+ " os.makedirs(output_folder)\n",
+ "\n",
+ " image_counter = 0 # Initialize a counter for the number of images processed\n",
+ "\n",
+ " # Iterate over each image in the input folder\n",
+ "\n",
+ " for image_file in os.listdir(input_folder):\n",
+ " image_path = os.path.join(input_folder, image_file)\n",
+ " \n",
+ " # Check if the file is an image\n",
+ " \n",
+ " if image_path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
+ " analyze_read(image_path, output_folder)\n",
+ " image_counter += 1 # Increment the counter\n",
+ "\n",
+ " if image_counter >= 35: # Stop after processing x images\n",
+ " break\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "python3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb
deleted file mode 100644
index 3823ad5..0000000
--- a/nbs/00_core.ipynb
+++ /dev/null
@@ -1,142 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# core\n",
- "\n",
- "> Fill in a module description here"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#| default_exp core"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#| export\n",
- "def say_hello(to):\n",
- " \"Say hello to somebody\"\n",
- " return f'Hello {to}!'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'Hello Isaac!'"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "say_hello(\"Isaac\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from fastcore.test import *"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "test_eq(say_hello(\"Hamel\"), \"Hello Hamel!\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from IPython.display import display,SVG"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/svg+xml": [
- ""
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "display(SVG(''))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#| hide\n",
- "from nbdev.showdoc import *"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#| export\n",
- "def foo(): pass"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#| hide\n",
- "import nbdev; nbdev.nbdev_export()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "python3",
- "language": "python",
- "name": "python3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/nbs/01_LLM_Evaluation.ipynb b/nbs/01_LLM_Evaluation.ipynb
new file mode 100644
index 0000000..fcd343c
--- /dev/null
+++ b/nbs/01_LLM_Evaluation.ipynb
@@ -0,0 +1,1770 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d8b43a6a-d877-4e02-96f1-138ac9a63fab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| default_exp LLM_Evaluation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1c40b010-eb49-4802-9d1e-699dafa2708f",
+ "metadata": {},
+ "source": [
+ "### Procedure for Evaluation : English Samples - Smriti\n",
+ "\n",
+ "* Open taxon, collector, geography gt-labels and the gpt-resized-results \n",
+ "* Check if gpt-file has attribute of scientificname, collected by / recorded by, country/locality\n",
+ "* Make count of all comparable files (some don't have valid responses - ignore these)\n",
+ "* Check match of the gt with produced labels (if produced label in gt) or perhaps some better match method (similarity measures)\n",
+ "* Count the number of matches above a similarity threshold (e.g. 80% match) \n",
+ "* Measure of accuracy "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4798a0bc-c049-4d90-8d97-6b685bb18b78",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| hide\n",
+ "#installations necessary\n",
+ "# pip install taxonerd\n",
+ "# pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.0/en_core_eco_biobert-1.0.2.tar.gz\n",
+ "# pip install tqdm\n",
+ "# pip install seaborn"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "93c2e2a2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! pip3 install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.0/en_core_eco_biobert-1.0.2.tar.gz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c645401-9c7a-4bba-9250-3252200f1860",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| hide\n",
+ "\n",
+ "import os\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import re\n",
+ "from taxonerd import TaxoNERD\n",
+ "from tqdm import tqdm\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3cf7c0f8-23c2-4fe9-b4ab-794c1aec4a44",
+ "metadata": {},
+ "source": [
+ "### Convert ground truth labels into Dataframe for easy access"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f82bf9f-c2ce-4a24-b113-e129ec293096",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/projectnb/sparkgrp/ml-herbarium-grp/fall2023\n"
+ ]
+ }
+ ],
+ "source": [
+ "gt = pd.DataFrame(columns=['ID', 'Taxon', 'Collector', 'Geography'])\n",
+ "\n",
+ "print(os.getcwd())\n",
+ "# os.chdir(\"gt_labels/\")\n",
+ "os.chdir(\"data/gt-labels/\")\n",
+ "\n",
+ "with open(\"taxon_gt.txt\", 'r') as tf, open(\"collector_gt.txt\", 'r') as cf, open(\"geography_gt.txt\", 'r') as gf:\n",
+ " t = []\n",
+ " c = []\n",
+ " g = []\n",
+ " id = []\n",
+ " for line in tf:\n",
+ " id.append(line.rstrip('\\n').split(\":\")[0])\n",
+ " t.append(line.rstrip('\\n').split(\":\")[1].lstrip())\n",
+ "\n",
+ " for line in cf:\n",
+ " c.append(line.rstrip('\\n').split(\":\")[1].lstrip())\n",
+ "\n",
+ " for line in gf:\n",
+ " g.append(line.rstrip('\\n').split(\":\")[1].lstrip())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "79d661b4-da95-436c-ae9e-c810bed03ba3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gt['ID'] = id\n",
+ "gt['Taxon'] = t\n",
+ "gt['Collector'] = c\n",
+ "gt['Geography'] = g"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6cf332f3-3c88-40b0-8608-432b8ab1292e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
ID
\n",
+ "
Taxon
\n",
+ "
Collector
\n",
+ "
Geography
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
1697659851
\n",
+ "
Euphrasia officinalis
\n",
+ "
Nazarov M. I.
\n",
+ "
Russian Federation
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
2573258025
\n",
+ "
Bryoerythrophyllum recurvirostrum
\n",
+ "
M. Morgan [?]
\n",
+ "
United States of America
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
2597666444
\n",
+ "
Carduus tenuiflorus
\n",
+ "
Lortet Clémence
\n",
+ "
France
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
1931288980
\n",
+ "
Agoseris parviflora
\n",
+ "
H. H. Rusby
\n",
+ "
United States of America
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
1930241969
\n",
+ "
Spiraea canescens
\n",
+ "
R. Pinkus
\n",
+ "
United States of America
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID Taxon Collector \\\n",
+ "0 1697659851 Euphrasia officinalis Nazarov M. I. \n",
+ "1 2573258025 Bryoerythrophyllum recurvirostrum M. Morgan [?] \n",
+ "2 2597666444 Carduus tenuiflorus Lortet Clémence \n",
+ "3 1931288980 Agoseris parviflora H. H. Rusby \n",
+ "4 1930241969 Spiraea canescens R. Pinkus \n",
+ "\n",
+ " Geography \n",
+ "0 Russian Federation \n",
+ "1 United States of America \n",
+ "2 France \n",
+ "3 United States of America \n",
+ "4 United States of America "
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gt.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c62652a7-8415-44ed-8c43-74fdf72f247c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gt = gt.dropna(axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fa1383b5-5b6d-4842-b801-e05ff1ceaac6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
"
+ ],
+ "text/plain": [
+ " ID Text\n",
+ "0 28 the text youve provided appears to refer to a ...\n",
+ "1 21 based on the information provided it seems to ...\n",
+ "2 32 from the description provided we can attempt t...\n",
+ "3 15 to create a json form in darwin core format we...\n",
+ "4 25 to provide the relevant darwin core informatio..."
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rt.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "385d02de-cb5e-4811-8ff0-c6e5a829a94c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
"
+ ],
+ "text/plain": [
+ " ID Handwritten Barcode Taxon Geography \\\n",
+ "0 1 No MW0464618 ledum palustre l. russia \n",
+ "1 2 Yes MW0001412 pinus funebris kom. russia \n",
+ "2 3 Yes MW0001364 pinus pumila rgl.\\r russia \n",
+ "3 4 Yes MW0001309 pinus pumila (pall.) regl.\\r russia \n",
+ "4 5 Yes MW0001310 pinus pumila rgl. russia \n",
+ "\n",
+ " Collector eventDate \\\n",
+ "0 вехов в.н. 1983-07-06 \n",
+ "1 в. комаров 1930-09-07 \n",
+ "2 в. куваев 1951-07-15 \n",
+ "3 \\tкарев г.и. 1930-0626 \n",
+ "4 в. комаров 1909-10-10 \n",
+ "\n",
+ " Label text (georgraphy and ecology) \\\n",
+ "0 Карелия, Ругозерская губа Кандалакшского залив... \n",
+ "1 г. Никольск-Уссурийский, долина р. Супутинки. ... \n",
+ "2 о. 6. Томп. Верх. хр. горы, лиственничная тайг... \n",
+ "3 Камчатка. Козыревский совхоз = урочище Среднек... \n",
+ "4 Камчатка. У Петропавловска на Петровской горе \n",
+ "\n",
+ " rTaxon rCollector rGeography \n",
+ "0 Ledum palustre L. Вехов В.Н. Russia \n",
+ "1 Pinus fundbais uma Russia \n",
+ "2 Russia \n",
+ "3 Pinus pumila (Pallas) Regel Г. И. Карев Russia \n",
+ "4 Pinus pumila V. Komarov Russia "
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "new_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e9976fb-25d4-467e-8821-6a1c86c6b6de",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([], dtype=int64), array([], dtype=int64))"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.where(pd.isnull(new_df))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e9680662-2bf0-404d-915e-413855edaf5a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([ 1, 2, 2, 6, 6, 7, 9, 9, 10, 11, 12, 12, 13, 13, 16, 16, 17,\n",
+ " 17, 19, 20, 27, 28, 29]),\n",
+ " array([ 9, 8, 9, 3, 8, 8, 3, 8, 8, 8, 3, 8, 3, 8, 6, 9, 3,\n",
+ " 8, 8, 8, 9, 10, 9]))"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.where(new_df.map(lambda x: x == ''))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c8b39695-9fe1-4f29-a3b0-178d66323414",
+ "metadata": {},
+ "source": [
+ "#### The extracted labels contain several variants of \"USA\", so all variants are converted to the ground-truth form (\"united states of america\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2126f4c8-a3d5-4194-9547-b7eac2febcb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "##Replacing all USA variations with one label\n",
+ "\n",
+ "# for i in new_df['rGeography']:\n",
+ "# if i==\"USA\" in i or \"United States\" in i:\n",
+ "\n",
+ "new_df['rGeography'] = new_df['rGeography'].apply(lambda i: \"united states of america\" if i.lower() in [\"usa\",\"united states\",\"us\"] else i)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a142e5b8-aadc-4f6d-81c2-a0a0c42016fa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "p = re.compile(r'[^\\s\\w]+')\n",
+ "\n",
+ "for i in new_df[['Taxon']]:\n",
+ " if i==\"ID\":\n",
+ " continue\n",
+ " new_df[i] = new_df[i].str.lower()\n",
+ " # print(new_df[i].tolist())\n",
+ " # new_df[i] = [p.sub('', x) for x in new_df[i].tolist()]\n",
+ " # print(new_df[i])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "afed47b1-42fc-4c8c-8a59-2c0b48cab2e2",
+ "metadata": {},
+ "source": [
+ "Note: We were unable to complete the removal of punctuation from the samples because the regex library had difficulty processing Cyrillic characters; we will explore other ways to preprocess Cyrillic data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e5acc560-59e8-4743-a445-1fe6c16aec46",
+ "metadata": {},
+ "source": [
+ "## Taxon Metrics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ee8b0c4-84b3-4ee0-87b6-ae8e38ea8a99",
+ "metadata": {},
+ "source": [
+ "### 1. Accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eab083af-5232-44da-a936-dd3eb2933268",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Taxon accuracy - count as 1 only if ground truth label is present as is in extracted label\n",
+ "c=0\n",
+ "for index, row in new_df.iterrows():\n",
+ " # print(row)\n",
+ " if pd.isna(row['Taxon']):\n",
+ " continue\n",
+ " if row['Taxon'].lower() in row['rTaxon'].lower():\n",
+ " c+=1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e418c60c-750d-4f10-9294-639db295ea05",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "15\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(c)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2aa6527b-7d57-425c-b732-5cb28afff52a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy obtained : 42.857142857142854%\n"
+ ]
+ }
+ ],
+ "source": [
+ "acc = c/len(new_df)*100\n",
+ "print(f\"Accuracy obtained : {acc}%\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99201349-032b-4b6e-a13b-4d1675e69f66",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Approach 2 : count as 1 if any word in ground truth label is present in extracted label\n",
+ "\n",
+ "c=0\n",
+ "for index, row in new_df.iterrows():\n",
+ " if pd.isna(row['Taxon']):\n",
+ " continue\n",
+ " if any(x in row['rTaxon'].lower() for x in row['Taxon'].lower().split()):\n",
+ " c+=1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f49411e0-5314-4af1-87e9-543d77612dde",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy obtained : 57.14285714285714%\n"
+ ]
+ }
+ ],
+ "source": [
+ "acc = c/len(new_df)*100\n",
+ "print(f\"Accuracy obtained : {acc}%\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "807b1cdf-48d0-4eb5-a22d-44b32e81c2b1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Approach 3 : count as 1 if all words in the ground truth label are present in the extracted label (need not be in order)\n",
+ "\n",
+ "c=0\n",
+ "for index, row in new_df.iterrows():\n",
+ " if pd.isna(row['Taxon']):\n",
+ " continue\n",
+ " if(set(row['Taxon'].lower().split()).issubset(set(row['rTaxon'].lower().split()))):\n",
+ " c+=1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b77bc03b-4d89-454f-bc28-ca46fbe3833e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "15\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(c)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1ab286eb-b3a2-47b9-9203-c3a475f26f47",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy obtained : 42.857142857142854%\n"
+ ]
+ }
+ ],
+ "source": [
+ "acc = c/len(new_df)*100\n",
+ "print(f\"Accuracy obtained : {acc}%\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6d0c39f1-7dd8-4ca3-b662-cd10642acea4",
+ "metadata": {},
+ "source": [
+ "## 2. Extended Approach : Utilizing TaxoNERD and Cosine Similarity for predicting closest taxa"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "02deceb5-f93a-4862-a47c-34c3cb08f497",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+ "To disable this warning, you can either:\n",
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+ "Sat Dec 9 23:21:10 2023 \n",
+ "+---------------------------------------------------------------------------------------+\n",
+ "| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\n",
+ "|-----------------------------------------+----------------------+----------------------+\n",
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
+ "| | | MIG M. |\n",
+ "|=========================================+======================+======================|\n",
+ "| 0 Tesla V100-SXM2-16GB On | 00000000:18:00.0 Off | 0 |\n",
+ "| N/A 44C P0 53W / 300W | 0MiB / 16384MiB | 0% E. Process |\n",
+ "| | | N/A |\n",
+ "+-----------------------------------------+----------------------+----------------------+\n",
+ "| 1 Tesla V100-SXM2-16GB On | 00000000:3B:00.0 Off | 0 |\n",
+ "| N/A 38C P0 44W / 300W | 0MiB / 16384MiB | 0% E. Process |\n",
+ "| | | N/A |\n",
+ "+-----------------------------------------+----------------------+----------------------+\n",
+ "| 2 Tesla V100-SXM2-16GB On | 00000000:86:00.0 Off | 0 |\n",
+ "| N/A 38C P0 43W / 300W | 3MiB / 16384MiB | 0% E. Process |\n",
+ "| | | N/A |\n",
+ "+-----------------------------------------+----------------------+----------------------+\n",
+ "| 3 Tesla V100-SXM2-16GB On | 00000000:AF:00.0 Off | 0 |\n",
+ "| N/A 42C P0 45W / 300W | 0MiB / 16384MiB | 0% E. Process |\n",
+ "| | | N/A |\n",
+ "+-----------------------------------------+----------------------+----------------------+\n",
+ " \n",
+ "+---------------------------------------------------------------------------------------+\n",
+ "| Processes: |\n",
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
+ "| ID ID Usage |\n",
+ "|=======================================================================================|\n",
+ "| No running processes found |\n",
+ "+---------------------------------------------------------------------------------------+\n"
+ ]
+ }
+ ],
+ "source": [
+ "! nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e54ed4e1-7538-4a10-aa56-cd5e24a119be",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "taxonerd = TaxoNERD(prefer_gpu=False) # set to True if a GPU is accessible"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55303f73-0846-4bc2-a250-fe2d80f9490d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| export\n",
+ "# utility functions for finding cosine similarity\n",
+ "\n",
+ "def word2vec(word):\n",
+ " from collections import Counter\n",
+ " from math import sqrt\n",
+ " \n",
+ " # count the characters in word\n",
+ " # print(word)\n",
+ " cw = Counter(word)\n",
+ " # precomputes a set of the different characters\n",
+ " sw = set(cw)\n",
+ " # precomputes the \"length\" of the word vector\n",
+ " lw = sqrt(sum(c*c for c in cw.values()))\n",
+ "\n",
+ " # return a tuple\n",
+ " return cw, sw, lw\n",
+ "\n",
+ "def cosdis(v1, v2):\n",
+ " # which characters are common to the two words?\n",
+ " common = v1[1].intersection(v2[1])\n",
+ " # by definition of cosine similarity we have\n",
+ " return sum(v1[0][ch]*v2[0][ch] for ch in common)/v1[2]/v2[2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f06e286-faf5-4282-9d6a-efd0482c86da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "taxon_output = []\n",
+ "confidence_output = []\n",
+ "\n",
+ "nlp = taxonerd.load(\n",
+ " model=\"en_core_eco_biobert\", \n",
+ " linker=\"gbif_backbone\", \n",
+ " threshold=0\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "595e81da-429d-494e-9232-218bf75bffc9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 35/35 [00:24<00:00, 1.41it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# predict taxons for text detected from each image\n",
+ "for index, row in tqdm(new_df.iterrows(), total=new_df.shape[0]):\n",
+ " temp = row[\"rTaxon\"]\n",
+ "\n",
+ " # use the extracted taxon string as the input text\n",
+ " input_text = temp\n",
+ " doc = taxonerd.find_in_text(input_text)\n",
+ "\n",
+ " try:\n",
+ " # append linked taxon entity and confidence\n",
+ " taxon_output.append(str(doc.entity[0][0][1]))\n",
+ " confidence_output.append(float(doc.entity[0][0][2]))\n",
+ "\n",
+ " except AttributeError:\n",
+ " # append an empty taxon and zero confidence when no entity is detected\n",
+ " taxon_output.append(\"\")\n",
+ " confidence_output.append(float(0))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1e57cfd1-9ef0-425e-9bdc-d7df4f3cb27f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# append predicted taxon and confidence scores to the dataframe\n",
+ "new_df[\"Taxon_pred_Output\"] = taxon_output\n",
+ "new_df[\"Taxon_Confidence_Output\"] = confidence_output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e75c1d86-36a9-4b53-af59-fdc34cd2c4a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# array to store computed similarity scores\n",
+ "cosine_sim = []\n",
+ "\n",
+ "for index, row in new_df.iterrows():\n",
+ "\n",
+ " # extract image name from the dataframe\n",
+ " # img_name = row[\"ID\"]\n",
+ " # if pd.isna(row['Taxon']):\n",
+ " # continue\n",
+ " taxon_predicted = row[\"Taxon_pred_Output\"]\n",
+ " taxon_gt = row[\"Taxon\"]\n",
+ "\n",
+ " # compute cosine similarity between the predicted taxon and ground truth\n",
+ " try:\n",
+ " sim = cosdis(word2vec(taxon_gt), word2vec(taxon_predicted))\n",
+ " cosine_sim.append(sim)\n",
+ " # print(taxon_gt, taxon_predicted, sim)\n",
+ "\n",
+ " except ZeroDivisionError:\n",
+ " cosine_sim.append(0)\n",
+ " # print(taxon_gt, taxon_predicted,\"0\")\n",
+ "\n",
+ "# append similarity scores to the dataframe\n",
+ "new_df[\"Cosine_Similarity\"] = cosine_sim"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a230d641-2757-47e0-8200-24571e514db7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
Confidence_Threshold
\n",
+ "
Taxons_Predicted
\n",
+ "
Taxons_Accuracy_Predicted
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
0.0
\n",
+ "
13.0
\n",
+ "
0.590909
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
0.1
\n",
+ "
13.0
\n",
+ "
0.590909
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
0.2
\n",
+ "
13.0
\n",
+ "
0.590909
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
0.3
\n",
+ "
13.0
\n",
+ "
0.590909
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
0.4
\n",
+ "
13.0
\n",
+ "
0.590909
\n",
+ "
\n",
+ "
\n",
+ "
5
\n",
+ "
0.5
\n",
+ "
13.0
\n",
+ "
0.619048
\n",
+ "
\n",
+ "
\n",
+ "
6
\n",
+ "
0.6
\n",
+ "
13.0
\n",
+ "
0.619048
\n",
+ "
\n",
+ "
\n",
+ "
7
\n",
+ "
0.7
\n",
+ "
12.0
\n",
+ "
0.666667
\n",
+ "
\n",
+ "
\n",
+ "
8
\n",
+ "
0.8
\n",
+ "
12.0
\n",
+ "
0.750000
\n",
+ "
\n",
+ "
\n",
+ "
9
\n",
+ "
0.9
\n",
+ "
11.0
\n",
+ "
0.733333
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Confidence_Threshold Taxons_Predicted Taxons_Accuracy_Predicted\n",
+ "0 0.0 13.0 0.590909\n",
+ "1 0.1 13.0 0.590909\n",
+ "2 0.2 13.0 0.590909\n",
+ "3 0.3 13.0 0.590909\n",
+ "4 0.4 13.0 0.590909\n",
+ "5 0.5 13.0 0.619048\n",
+ "6 0.6 13.0 0.619048\n",
+ "7 0.7 12.0 0.666667\n",
+ "8 0.8 12.0 0.750000\n",
+ "9 0.9 11.0 0.733333"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "final_taxon_prediction = pd.DataFrame(columns=[\"Confidence_Threshold\", \"Taxons_Predicted\", \"Taxons_Accuracy_Predicted\"])\n",
+ "temp_df = pd.DataFrame()\n",
+ "\n",
+ "# cosine-similarity cutoff above which a predicted taxon counts as correct\n",
+ "# sim_threshold = [0.9]\n",
+ "sim_threshold =0.8\n",
+ "\n",
+ "# generate list of confidence thresholds\n",
+ "confidence_threshold = np.arange(0, 1, 0.1)\n",
+ "\n",
+ "# compute prediction accuracy at each confidence threshold\n",
+ "for conf_threshold in confidence_threshold:\n",
+ " \n",
+ " temp_df = new_df[(new_df[\"Taxon_Confidence_Output\"] > conf_threshold)]\n",
+ " \n",
+ " acc_count = (temp_df[\"Cosine_Similarity\"] > sim_threshold).sum()\n",
+ "\n",
+ " acc_val = acc_count/len(temp_df)\n",
+ "\n",
+ " temp = [conf_threshold, acc_count, acc_val]\n",
+ " final_taxon_prediction.loc[len(final_taxon_prediction)] = temp\n",
+ "\n",
+ "display(final_taxon_prediction)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "39f242d6-f9e1-4efc-9df6-dd0eab84680d",
+ "metadata": {},
+ "source": [
+ "Note : Adapted from Spring semester trocr-test notebook\n",
+ "\n",
+ "1. We first obtain the taxon predictions with a confidence score for each taxon.\n",
+ "2. We then compute cosine similarities of the predicted taxons with the ground truth taxons.\n",
+ "3. We then, at each interval of the confidence threshold, compute the number of taxons that have a high cosine similarity with the ground truth. The scores above are computed for a specific cosine similarity score \">0.8\". We need to perform this step because the taxons are matched against entries from the ncbi_taxonomy database (as part of TaxoNERD) and the predicted taxon might not exactly match the ground truth; we account for this using cosine similarity.\n",
+ "4. We can try to use the GBIF database to predict taxons and also experiment with different thresholds for the cosine similarity scores. But, in general, the chosen cosine similarity threshold offers an incremental performance upgrade compared to the last semester's work\n",
+ "5. Our highest accuracy = 75.0% (at a confidence threshold of 0.8, per the table above) compared to Spring semester = 71.7%"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cbcb820e-32d7-4008-8e86-0801b0b412cc",
+ "metadata": {},
+ "source": [
+ "## Collector Metrics\n",
+ "\n",
+ "### 1. Accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "80a077c7-3470-47cb-ab11-69ca0c03fa88",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Approach 1 - count as 1 only if ground truth label is present as is in extracted label\n",
+ "c=0\n",
+ "for index, row in new_df.iterrows():\n",
+ " if row['Collector'].lower() in row['rCollector'].lower():\n",
+ " c+=1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1a82c6d5-d1ae-44ea-b2b0-6c176c207858",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy obtained : 8.571428571428571%\n"
+ ]
+ }
+ ],
+ "source": [
+ "acc = c/len(new_df)*100\n",
+ "print(f\"Accuracy obtained : {acc}%\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9b0b3928-364b-4a9f-96e7-e1bb65cfdf16",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Approach 2 : count as 1 if any word in ground truth label is present in extracted label\n",
+ "\n",
+ "c=0\n",
+ "for index, row in new_df.iterrows():\n",
+ " if any(x in row['rCollector'].lower() for x in row['Collector'].lower().split()):\n",
+ " # print(row['Collector'].lower(), row['rCollector'].lower())\n",
+ " c+=1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6adffedb-a588-4a24-807e-017f86d58077",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy obtained : 51.42857142857142%\n"
+ ]
+ }
+ ],
+ "source": [
+ "acc = c/len(new_df)*100\n",
+ "print(f\"Accuracy obtained : {acc}%\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "80245443-50b8-4902-b726-99724771f0ea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Approach 3 : count as 1 if all words in the ground truth label are present in the extracted label (need not be in order)\n",
+ "\n",
+ "c=0\n",
+ "for index, row in new_df.iterrows():\n",
+ " if(set(row['Collector'].lower().split()).issubset(set(row['rCollector'].lower().split()))):\n",
+ " c+=1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c5542b8-7a09-4c68-876d-d8367e34a59c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy obtained : 14.285714285714285%\n"
+ ]
+ }
+ ],
+ "source": [
+ "acc = c/len(new_df)*100\n",
+ "print(f\"Accuracy obtained : {acc}%\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c5e48ec8-efe9-4288-89cb-b1e95f8b33fe",
+ "metadata": {},
+ "source": [
+ "## Geography Metrics\n",
+ "\n",
+ "### 1. Accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bd929e1e-9af8-42ac-8edd-9f633dccc9e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Approach 1 - count as 1 only if ground truth label is present as is in extracted label\n",
+ "c=0\n",
+ "for index, row in new_df.iterrows():\n",
+ " \n",
+ " if row['Geography'].lower() in row['rGeography'].lower():\n",
+ " # print(row['Geography'].lower(), row['rGeography'].lower())\n",
+ " c+=1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa790f8d-6489-40df-a8f2-65ae37748c17",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy obtained : 97.14285714285714%\n"
+ ]
+ }
+ ],
+ "source": [
+ "acc = c/len(new_df)*100\n",
+ "print(f\"Accuracy obtained : {acc}%\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9830087a-6127-48fd-81c4-26753dac129f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Approach 2 : count as 1 if any word in ground truth label is present in extracted label\n",
+ "\n",
+ "c=0\n",
+ "for index, row in new_df.iterrows():\n",
+ " if any(x in row['rGeography'].lower() for x in row['Geography'].lower().split()):\n",
+ " # print(row['Geography'], row['rGeography'])\n",
+ " c+=1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd268b88-376a-43ba-813a-8c9901525967",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy obtained : 97.14285714285714%\n"
+ ]
+ }
+ ],
+ "source": [
+ "acc = c/len(new_df)*100\n",
+ "print(f\"Accuracy obtained : {acc}%\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4f61df42-11b2-4bfb-8fbf-a9841c2cabb2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Approach 3 : count as 1 if all words in the ground truth label are present in the extracted label (need not be in order)\n",
+ "\n",
+ "c=0\n",
+ "for index, row in new_df.iterrows():\n",
+ " if(set(row['Geography'].lower().split()).issubset(set(row['rGeography'].lower().split()))):\n",
+ " c+=1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "633c8ace-3784-4edc-9e53-a171b6889ad3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy obtained : 97.14285714285714%\n"
+ ]
+ }
+ ],
+ "source": [
+ "acc = c/len(new_df)*100\n",
+ "print(f\"Accuracy obtained : {acc}%\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bd1e2287-a035-433e-a430-757510ae6633",
+ "metadata": {},
+ "source": [
+ "## Visualizations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b67e09a4-3b3d-474c-83bc-f9869bab388b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAGwCAYAAABPSaTdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy81sbWrAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA9Z0lEQVR4nO3dd3hU1d728XtCKmmQACmcEEKRZiiKFFFRCAQFpCmCUUBpShcVnygCKhIFBQQRhEeKCIiKoB4PKKAU6S0UgdBJKIEjSAJBQkjW+4cv8zgGMCETkmy+n+va18Vee82a38zZTu6zZu3ZNmOMEQAAgEW5FHQBAAAA+YmwAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALM21oAsoDLKysnTixAn5+vrKZrMVdDkAACAHjDE6f/68QkND5eJy/fkbwo6kEydOKCwsrKDLAAAANyEpKUn/+te/rnucsCPJ19dX0p9vlp+fXwFXAwAAciI1NVVhYWH2v+PXQ9iR7F9d+fn5EXYAAChi/mkJCguUAQCApRVo2Fm1apVat26t0NBQ2Ww2LVq0yOG4MUbDhg1TSEiIvLy8FBUVpf379zv0OXv2rGJiYuTn56cSJUqoe/fuunDhwi18FQAAoDAr0LCTlpamWrVqadKkSdc8Pnr0aE2YMEFTpkzRhg0b5O3trejoaF26dMneJyYmRr/++quWLl2qf//731q1apV69ep1q14CAAAo7EwhIcksXLjQvp+VlWWCg4PNmDFj7G3nzp0zHh4eZt68ecYYY3bv3m0kmU2bNtn7LF682NhsNnP8+PHrPtelS5dMSkqKfUtKSjKSTEpKivNfWBGRmppqBg4caMqVK2c8PT1Nw4YNzcaNG+3HJV1zGz169HXHXLlypWnVqpUJCQnJ9r/vtfTu3dtIMuPGjXPSqwIAWFlKSkqO/n4X2jU7hw8fVnJysqKiouxt/v7+ql+/vtatWydJWrdunUqUKKG6deva+0RFRcnFxUUbNmy47thxcXHy9/e3b1x2LvXo0UNLly7V7NmztXPnTjVv3lxRUVE6fvy4JOnkyZMO2/Tp02Wz2dShQ4frjvlPM3d/tXDhQq1fv16hoaFOe00AAEiFeIFycnKyJCkoKMihPSgoyH4sOTlZZcqUcTju6uqqgIAAe59riY2NVUpKin1LSkpycvVFyx9//KEFCxZo9OjReuCBB1SpUiWNGDFClSpV0uTJkyVJwcHBDts333yjhx56SBUqVLjuuA8//LBGjhypdu3a3fD5jx8/rv79+2vOnDlyc3Nz6msDAOC2vPTcw8NDHh4eBV1GoXHlyhVlZmbK09PTod3Ly0u//PJLtv6nTp3S999/r1mzZuX5ubOysvT000/r5ZdfVo0aNfI8HgAAf1doZ3aCg4Ml/fmH9a9OnTplPxYcHKzTp087HL9y5YrOnj1r74N/5uvrq4YNG+qtt97SiRMnlJmZqc8++0zr1q3TyZMns/WfNWuWfH191b59+zw/97vvvitXV1cNGDAgz2MBAHAthTbsREREKDg4WMuXL7e3paamasOGDWrYsKEkqWHDhjp37py2bNli7/PTTz8pKytL9evXv+U1F2WzZ8+WMUZly5aVh4eHJkyYoM6dO1/zXiPTp09XTExMtpmg3NqyZYs++OADzZw5k3uSAQDyTYGGnQsXLig+Pl7x8fGS/lyUHB8fr8TERNlsNg0aNEgjR47Ut99+q507d6pLly4KDQ1V27ZtJUnVqlVTixYt1LNnT23cuFFr1qxRv3791KlTJxa65lLFihW1cuVKXbhwQUlJSdq4caMyMjKyrclZvXq1EhIS1KNHjzw/5+rVq3X69GmVK1dOrq6ucnV11dGjR/Xiiy+qfPnyeR4fAACpgNfsbN68WQ899JB9f/DgwZKkrl27aubMmRoyZIjS0tLUq1cvnTt3Tvfdd5+WLFniMKMwZ84c9evXT02bNpWLi4s6
dOigCRMm3PLXYhXe3t7y9vbW77//rh9++EGjR492OP7JJ5/o7rvvVq1atfL8XE8//bTD1XaSFB0draefflrPPPNMnscHAECSbMYYU9BFFLTU1FT5+/srJSXltr031g8//CBjjKpUqaIDBw7o5Zdflqenp1avXm2/Qio1NVUhISF6//339dxzz2Ubo2nTpmrXrp369esn6c+ZuwMHDkiS6tSpo7Fjx+qhhx5SQECAypUrd806ypcvr0GDBmnQoEH580IBAJaR07/ft+XVWMguJSVFsbGxOnbsmAICAtShQwe9/fbbDpeCf/755zLGqHPnztcc4+DBg/rtt9/s+/80cwcAwK3AzI6Y2QEAoCjK6d/vQns1FgAAgDPwNZaTNPn284IuAYXIT492KugSAAD/HzM7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gg7AADA0gp12MnMzNTrr7+uiIgIeXl5qWLFinrrrbdkjLH3McZo2LBhCgkJkZeXl6KiorR///4CrBoAABQmhTrsvPvuu5o8ebI+/PBD7dmzR++++65Gjx6tiRMn2vuMHj1aEyZM0JQpU7RhwwZ5e3srOjpaly5dKsDKAQBAYeFa0AXcyNq1a9WmTRu1bNlSklS+fHnNmzdPGzdulPTnrM748eM1dOhQtWnTRpL06aefKigoSIsWLVKnTp0KrHYAAFA4FOqZnXvvvVfLly/Xvn37JEnbt2/XL7/8oocffliSdPjwYSUnJysqKsr+GH9/f9WvX1/r1q277rjp6elKTU112AAAgDUV6pmd//mf/1FqaqqqVq2qYsWKKTMzU2+//bZiYmIkScnJyZKkoKAgh8cFBQXZj11LXFyc3njjjfwrHAAAFBqFembniy++0Jw5czR37lxt3bpVs2bN0nvvvadZs2bladzY2FilpKTYt6SkJCdVDAAACptCPbPz8ssv63/+53/sa28iIyN19OhRxcXFqWvXrgoODpYknTp1SiEhIfbHnTp1SrVr177uuB4eHvLw8MjX2gEAQOFQqGd2Ll68KBcXxxKLFSumrKwsSVJERISCg4O1fPly+/HU1FRt2LBBDRs2vKW1AgCAwqlQz+y0bt1ab7/9tsqVK6caNWpo27ZtGjt2rJ599llJks1m06BBgzRy5EhVrlxZERERev311xUaGqq2bdsWbPEAAKBQKNRhZ+LEiXr99dfVp08fnT59WqGhoerdu7eGDRtm7zNkyBClpaWpV69eOnfunO677z4tWbJEnp6eBVg5AAAoLGzmrz9HfJtKTU2Vv7+/UlJS5Ofnd1NjNPn2cydXhaLsp0f5jadbqXz58jp69Gi29j59+mjSpEl68MEHtXLlSodjvXv31pQpU25ViQDyQU7/fhfqmR0AyIlNmzYpMzPTvr9r1y41a9ZMjz/+uL2tZ8+eevPNN+37xYsXv6U1Aig4hB0ARV7p0qUd9t955x1VrFhRjRs3trcVL17cfgUngNtLob4aCwBy6/Lly/rss8/07LPPymaz2dvnzJmjUqVK6c4771RsbKwuXrxYgFUCuJWY2QFgKYsWLdK5c+fUrVs3e9uTTz6p8PBwhYaG
aseOHXrllVeUkJCgr7/+uuAKBXDLEHYAWMonn3yihx9+WKGhofa2Xr162f8dGRmpkJAQNW3aVAcPHlTFihULokwAtxBfYwGwjKNHj2rZsmXq0aPHDfvVr19fknTgwIFbURaAAkbYAWAZM2bMUJkyZdSyZcsb9ouPj5ckh9vMALAuvsYCYAlZWVmaMWOGunbtKlfX//toO3jwoObOnatHHnlEgYGB2rFjh1544QU98MADqlmzZgFWDOBWIewAsIRly5YpMTHRfjuZq9zd3bVs2TKNHz9eaWlpCgsLU4cOHTR06NACqhTArUbYAWAJzZs317V+ED4sLCzbrycDuL2wZgcAAFgaMzuAhZ1Z/npBl4BCJLDpWwVdAlAgmNkBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACW5pqbzllZWVq5cqVWr16to0eP6uLFiypdurTq1KmjqKgohYWF5VedAAAANyVHMzt//PGHRo4cqbCwMD3yyCNavHixzp07p2LFiunAgQMaPny4IiIi9Mgjj2j9+vX5XTMAAECO5Whm54477lDDhg01bdo0NWvWTG5ubtn6HD16VHPnzlWnTp302muvqWfPnk4vFgAAILdyFHZ+/PFHVatW7YZ9wsPDFRsbq5deekmJiYlOKQ4AACCvcvQ11j8Fnb9yc3NTxYoVb7ogAAAAZ8rVAuW/unLlij7++GOtWLFCmZmZatSokfr27StPT09n1gcAAJAnNx12BgwYoH379ql9+/bKyMjQp59+qs2bN2vevHnOrA8AACBPchx2Fi5cqHbt2tn3f/zxRyUkJKhYsWKSpOjoaDVo0MD5FQIAAORBjn9UcPr06Wrbtq1OnDghSbrrrrv03HPPacmSJfruu+80ZMgQ3XPPPflWKAAAwM3Icdj57rvv1LlzZz344IOaOHGipk6dKj8/P7322mt6/fXXFRYWprlz5+ZnrQAAALmWqzU7TzzxhKKjozVkyBBFR0drypQpev/99/OrNgAAgDzL9b2xSpQooalTp2rMmDHq0qWLXn75ZV26dCk/agMAAMizHIedxMREdezYUZGRkYqJiVHlypW1ZcsWFS9eXLVq1dLixYvzs04AAICbkuOw06VLF7m4uGjMmDEqU6aMevfuLXd3d73xxhtatGiR4uLi1LFjx/ysFQAAINdyvGZn8+bN2r59uypWrKjo6GhFRETYj1WrVk2rVq3S1KlT86VIAACAm5XjsHP33Xdr2LBh6tq1q5YtW6bIyMhsfXr16uXU4gAAAPIqx19jffrpp0pPT9cLL7yg48eP6+OPP87PuuyOHz+up556SoGBgfLy8lJkZKQ2b95sP26M0bBhwxQSEiIvLy9FRUVp//79t6Q2AABQ+OV4Zic8PFxfffVVftaSze+//65GjRrpoYce0uLFi1W6dGnt379fJUuWtPcZPXq0JkyYoFmzZikiIkKvv/66oqOjtXv3bu7TBQAAbv7eWLfCu+++q7CwMM2YMcPe9te1QsYYjR8/XkOHDlWbNm0k/TkDFRQUpEWLFqlTp07XHDc9PV3p6en2/dTU1Hx6BQAAoKDl+nd2bqVvv/1WdevW1eOPP64yZcqoTp06mjZtmv344cOHlZycrKioKHubv7+/6tevr3Xr1l133Li4OPn7+9u3sLCwfH0dAACg4BTqsHPo0CFNnjxZlStX1g8//KDnn39eAwYM0KxZsyRJycnJkqSgoCCHxwUFBdmPXUtsbKxSUlLsW1JSUv69CAAAUKAK9ddYWVlZ
qlu3rkaNGiVJqlOnjnbt2qUpU6aoa9euNz2uh4eHPDw8nFUmAAAoxAr1zE5ISIiqV6/u0FatWjUlJiZKkoKDgyVJp06dcuhz6tQp+zEAAHB7y/XMTlpamt555x0tX75cp0+fVlZWlsPxQ4cOOa24Ro0aKSEhwaFt3759Cg8Pl/TnYuXg4GAtX75ctWvXlvTnYuMNGzbo+eefd1odAACg6Mp12OnRo4dWrlypp59+WiEhIbLZbPlRlyTphRde0L333qtRo0apY8eO2rhxo6ZOnWr/pWabzaZBgwZp5MiRqly5sv3S89DQULVt2zbf6gIAAEVHrsPO4sWL9f3336tRo0b5UY+De+65RwsXLlRsbKzefPNNRUREaPz48YqJibH3GTJkiNLS0tSrVy+dO3dO9913n5YsWcJv7AAAAEk3EXZKliypgICA/Kjlmlq1aqVWrVpd97jNZtObb76pN99885bVBAAAio5cL1B+6623NGzYMF28eDE/6gEAAHCqXM/svP/++zp48KCCgoJUvnx5ubm5ORzfunWr04oDAADIq1yHHRb+AgCAoiTXYWf48OH5UQcAAEC+KNQ/KggAAJBXOZrZCQgI0L59+1SqVCmVLFnyhr+tc/bsWacVBwAAkFc5Cjvjxo2Tr6+vJGn8+PH5WQ8AAIBT5Sjs/PWmm3m5AScAAMCtlqM1O2lpabkaNLf9AQAA8kuOwk6lSpX0zjvv6OTJk9ftY4zR0qVL9fDDD2vChAlOKxAAACAvcvQ11ooVK/Tqq69qxIgRqlWrlurWravQ0FB5enrq999/1+7du7Vu3Tq5uroqNjZWvXv3zu+6AQAAciRHMztVqlTRggULtG/fPnXs2FHHjx/XV199pWnTpmnFihUqW7aspk2bpiNHjqhPnz4qVqxYftcNAMAtNWLECNlsNoetatWq9uNTp07Vgw8+KD8/P9lsNp07dy5X47/zzjuy2WwaNGiQQ3vv3r1VsWJFeXl5qXTp0mrTpo327t3rhFd0+8jVjwqWK1dOL774ol588cX8qgcAgEKrRo0aWrZsmX3f1fX//oxevHhRLVq0UIsWLRQbG5urcTdt2qSPP/5YNWvWzHbs7rvvVkxMjMqVK6ezZ89qxIgRat68uQ4fPszkQg7l+heUAQC4Xbm6uio4OPiax67OyKxYsSJXY164cEExMTGaNm2aRo4cme14r1697P8uX768Ro4cqVq1aunIkSOqWLFirp7rdsUvKAMAkEP79+9XaGioKlSooJiYGCUmJuZ5zL59+6ply5aKior6x75paWmaMWOGIiIiFBYWlufnvl0QdgAAyIH69etr5syZWrJkiSZPnqzDhw/r/vvv1/nz5296zM8//1xbt25VXFzcDft99NFH8vHxkY+PjxYvXqylS5fK3d39pp/3dkPYAQAgBx5++GE9/vjjqlmzpqKjo/Wf//xH586d0xdffHFT4yUlJWngwIGaM2eOPD09b9g3JiZG27Zt08qVK3XHHXeoY8eOunTp0k097+2INTsAANyEEiVK6I477tCBAwdu6vFbtmzR6dOnddddd9nbMjMztWrVKn344YdKT0+3L0D29/eXv7+/KleurAYNGqhkyZJauHChOnfu7JTXYnW5ntkpX7683nzzTad8TwkAQFF14cIFHTx4UCEhITf1+KZNm2rnzp2Kj4+3b3Xr1lVMTIzi4+Ove6WVMUbGGKWnp+el/NtKrsPOoEGD9PXXX6tChQpq1qyZPv/8c95wAIDlvfTSS1q5cqWOHDmitWvXql27dipWrJh9diU5OVnx8fH2mZ6rQebs2bP2MZo2baoPP/xQkuTr66s777zTYfP29lZgYKDuvPNOSdKhQ4cUFxenLVu2KDExUWvXrtXjjz8uLy8vPfLII7f4HSi6birsxMfHa+PGjapWrZr69++vkJAQ9evXT1u3bs2PGgEAKHDHjh1T586dVaVKFXXs2FGBgYFav369SpcuLUmaMmWK6tSpo549e0qS
HnjgAdWpU0fffvutfYyDBw/qt99+y/Fzenp6avXq1XrkkUdUqVIlPfHEE/L19dXatWtVpkwZ575AC7MZY0xeBsjIyNBHH32kV155RRkZGYqMjNSAAQP0zDPPyGazOavOfJWamip/f3+lpKTIz8/vpsZo8u3nTq4KRdlPj3Yq6BIkSWeWv17QJaAQCWz6VkGXADhVTv9+3/QC5YyMDC1cuFAzZszQ0qVL1aBBA3Xv3l3Hjh3Tq6++qmXLlmnu3Lk3OzwAAIBT5DrsbN26VTNmzNC8efPk4uKiLl26aNy4cQ73B2nXrp3uuecepxYKACj6Tr/5vwVdAgqRMsN63JLnyXXYueeee9SsWTNNnjxZbdu2lZubW7Y+ERER6tSpcEzjAwCA21uuw86hQ4cUHh5+wz7e3t6aMWPGTRcFAADgLLm+Guv06dPasGFDtvYNGzZo8+bNTikKAADAWXIddvr27aukpKRs7cePH1ffvn2dUhQAAICz5Drs7N692+Gnra+qU6eOdu/e7ZSiAAAAnCXXYcfDw0OnTp3K1n7y5Em5unKrLQAAULjkOuw0b95csbGxSklJsbedO3dOr776qpo1a+bU4gAAAPIq11Mx7733nh544AGFh4erTp06kqT4+HgFBQVp9uzZTi8QAAAgL3IddsqWLasdO3Zozpw52r59u7y8vPTMM8+oc+fO1/zNHQAAgIJ0U4tsvL291atXL2fXAgAA4HQ3vaJ49+7dSkxM1OXLlx3aH3300TwXBQAA4Cw39QvK7dq1086dO2Wz2XT1pulX73CemZnp3AoBAADyINdXYw0cOFARERE6ffq0ihcvrl9//VWrVq1S3bp1tWLFinwoEQAA4OblemZn3bp1+umnn1SqVCm5uLjIxcVF9913n+Li4jRgwABt27YtP+oEAAC4Kbme2cnMzJSvr68kqVSpUjpx4oQkKTw8XAkJCc6tDgAAII9yPbNz5513avv27YqIiFD9+vU1evRoubu7a+rUqapQoUJ+1AgAAHDTch12hg4dqrS0NEnSm2++qVatWun+++9XYGCg5s+f7/QCAQAA8iLXYSc6Otr+70qVKmnv3r06e/asSpYsab8iCwAAoLDI1ZqdjIwMubq6ateuXQ7tAQEBBB0AAFAo5SrsuLm5qVy5cvyWDgAAKDJyfTXWa6+9pldffVVnz57Nj3oAAACcKtdrdj788EMdOHBAoaGhCg8Pl7e3t8PxrVu3Oq04AACAvMp12Gnbtm0+lAEAAJA/ch12hg8fnh91AAAA5Itcr9kBAAAoSnI9s+Pi4nLDy8y5UgsAABQmuQ47CxcudNjPyMjQtm3bNGvWLL3xxhtOKwwAAMAZch122rRpk63tscceU40aNTR//nx1797dKYUBAAA4g9PW7DRo0EDLly931nAAAABO4ZSw88cff2jChAkqW7asM4YDAABwmlx/jfX3G34aY3T+/HkVL15cn332mVOLAwAAyKtch51x48Y5hB0XFxeVLl1a9evXV8mSJZ1aHAAAQF7lOux069YtH8oAAADIH7leszNjxgx9+eWX2dq//PJLzZo1yylFAQAAOEuuw05cXJxKlSqVrb1MmTIaNWqUU4oCAABwllyHncTEREVERGRrDw8PV2JiolOKAgAAcJZch50yZcpox44d2dq3b9+uwMBApxQFAADgLLkOO507d9aAAQP0888/KzMzU5mZmfrpp580cOBAderUKT9qtHvnnXdks9k0aNAge9ulS5fUt29fBQYGysfHRx06dNCpU6fytQ4AAFB05DrsvPXWW6pfv76aNm0qLy8veXl5qXnz5mrSpEm+rtnZtGmTPv74Y9WsWdOh/YUXXtB3332nL7/8UitXrtSJEyfUvn37fKsDAAAULbm+9Nzd3V3z58/XyJEjFR8fLy8vL0VGRio8PDw/6pMkXbhwQTExMZo2bZpGjhxpb09JSdEnn3yiuXPnqkmTJpL+vFqsWrVqWr9+vRo0aJBvNQEAgKIh
12HnqsqVK6ty5crOrOW6+vbtq5YtWyoqKsoh7GzZskUZGRmKioqyt1WtWlXlypXTunXrrht20tPTlZ6ebt9PTU3Nv+IBAECByvXXWB06dNC7776brX306NF6/PHHnVLUX33++efaunWr4uLish1LTk6Wu7u7SpQo4dAeFBSk5OTk644ZFxcnf39/+xYWFubssgEAQCGR67CzatUqPfLII9naH374Ya1atcopRV2VlJSkgQMHas6cOfL09HTauLGxsUpJSbFvSUlJThsbAAAULrkOOxcuXJC7u3u2djc3N6d/HbRlyxadPn1ad911l1xdXeXq6qqVK1dqwoQJcnV1VVBQkC5fvqxz5845PO7UqVMKDg6+7rgeHh7y8/Nz2AAAgDXlOuxERkZq/vz52do///xzVa9e3SlFXdW0aVPt3LlT8fHx9q1u3bqKiYmx/9vNzU3Lly+3PyYhIUGJiYlq2LChU2sBAABFU64XKL/++utq3769Dh48aL8Cavny5Zo3b94175mVF76+vrrzzjsd2ry9vRUYGGhv7969uwYPHqyAgAD5+fmpf//+atiwIVdiAQAASTcRdlq3bq1FixZp1KhR+uqrr+Tl5aWaNWtq2bJlaty4cX7UeEPjxo2Ti4uLOnTooPT0dEVHR+ujjz665XUAAIDC6aYuPW/ZsqVatmyZrX3Xrl3ZZmKcbcWKFQ77np6emjRpkiZNmpSvzwsAAIqmXK/Z+bvz589r6tSpqlevnmrVquWMmgAAAJzmpsPOqlWr1KVLF4WEhOi9995TkyZNtH79emfWBgAAkGe5+horOTlZM2fO1CeffKLU1FR17NhR6enpWrRokdOvxAIAAHCGHM/stG7dWlWqVNGOHTs0fvx4nThxQhMnTszP2gAAAPIsxzM7ixcv1oABA/T888/fsntiAQAA5FWOZ3Z++eUXnT9/Xnfffbfq16+vDz/8UL/99lt+1gYAAJBnOQ47DRo00LRp03Ty5En17t1bn3/+uUJDQ5WVlaWlS5fq/Pnz+VknAADATcn11Vje3t569tln9csvv2jnzp168cUX9c4776hMmTJ69NFH86NGAACAm5an39mpUqWKRo8erWPHjmnevHnOqgkAAMBp8vyjgpJUrFgxtW3bVt9++60zhgMAAHAap4QdAACAwoqwAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALI2wAwAALK1Qh524uDjdc8898vX1VZkyZdS2bVslJCQ49Ll06ZL69u2rwMBA+fj4qEOHDjp16lQBVQwAAAqbQh12Vq5cqb59+2r9+vVaunSpMjIy1Lx5c6Wlpdn7vPDCC/ruu+/05ZdfauXKlTpx4oTat29fgFUDAIDCxLWgC7iRJUuWOOzPnDlTZcqU0ZYtW/TAAw8oJSVFn3zyiebOnasmTZpIkmbMmKFq1app/fr1atCgwTXHTU9PV3p6un0/NTU1/14EAAAoUIV6ZufvUlJSJEkBAQGSpC1btigjI0NRUVH2PlWrVlW5cuW0bt26644TFxcnf39/+xYWFpa/hQMAgAJTZMJOVlaWBg0apEaNGunOO++UJCUnJ8vd3V0lSpRw6BsUFKTk5OTrjhUbG6uUlBT7lpSUlJ+lAwCAAlSov8b6q759
+2rXrl365Zdf8jyWh4eHPDw8nFAVAAAo7IrEzE6/fv3073//Wz///LP+9a9/2duDg4N1+fJlnTt3zqH/qVOnFBwcfIurBAAAhVGhDjvGGPXr108LFy7UTz/9pIiICIfjd999t9zc3LR8+XJ7W0JCghITE9WwYcNbXS4AACiECvXXWH379tXcuXP1zTffyNfX174Ox9/fX15eXvL391f37t01ePBgBQQEyM/PT/3791fDhg2veyUWAAC4vRTqsDN58mRJ0oMPPujQPmPGDHXr1k2SNG7cOLm4uKhDhw5KT09XdHS0Pvroo1tcKQAAKKwKddgxxvxjH09PT02aNEmTJk26BRUBAICiplCv2QEAAMgrwg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0wg4AALA0y4SdSZMmqXz58vL09FT9+vW1cePGgi4JAAAUApYIO/Pnz9fgwYM1fPhwbd26VbVq1VJ0dLROnz5d0KUBAIACZomwM3bsWPXs2VPPPPOMqlevrilTpqh48eKaPn16QZcGAAAKmGtBF5BXly9f1pYtWxQbG2tvc3FxUVRUlNatW3fNx6Snpys9Pd2+n5KSIklKTU296TquXLx404+F9eTlXHKm82np/9wJtw23QnBenr/0R0GXgELEM4/n5NXPWmPMDfsV+bDz22+/KTMzU0FBQQ7tQUFB2rt37zUfExcXpzfeeCNbe1hYWL7UiNuPv7oXdAnANYwp6AIAR3EDnDLM+fPn5e/vf93jRT7s3IzY2FgNHjzYvp+VlaWzZ88qMDBQNputACsr2lJTUxUWFqakpCT5+fkVdDmAJM5LFD6ck85jjNH58+cVGhp6w35FPuyUKlVKxYoV06lTpxzaT506peDg4Gs+xsPDQx4eHg5tJUqUyK8Sbzt+fn78B4xCh/MShQ3npHPcaEbnqiK/QNnd3V133323li9fbm/LysrS8uXL1bBhwwKsDAAAFAZFfmZHkgYPHqyuXbuqbt26qlevnsaPH6+0tDQ988wzBV0aAAAoYJYIO0888YT++9//atiwYUpOTlbt2rW1ZMmSbIuWkb88PDw0fPjwbF8RAgWJ8xKFDefkrWcz/3S9FgAAQBFW5NfsAAAA3AhhBwAAWBphBwAAWBphB0VG+fLlNX78+IIuAwCyGTFihGrXrm3f79atm9q2bVtg9cARYccCkpOTNXDgQFWqVEmenp4KCgpSo0aNNHnyZF3knl0ogmw22w23ESNGFHSJsJjk5GT1799fFSpUkIeHh8LCwtS6dWuH33ArSISnvLHEpee3s0OHDqlRo0YqUaKERo0apcjISHl4eGjnzp2aOnWqypYtq0cffbTA6svIyJCbm1uBPT+KppMnT9r/PX/+fA0bNkwJCQn2Nh8fn4IoCxZ15MgR++fomDFjFBkZqYyMDP3www/q27fvde+zWBRdvnxZ7u7uBV3GLcfMThHXp08fubq6avPmzerYsaOqVaumChUqqE2bNvr+++/VunVrSdK5c+fUo0cPlS5dWn5+fmrSpIm2b9/uMNbkyZNVsWJFubu7q0qVKpo9e7bD
8b179+q+++6Tp6enqlevrmXLlslms2nRokWS/vzAsNlsmj9/vho3bixPT0/NmTNHZ86cUefOnVW2bFkVL15ckZGRmjdvnsPYDz74oPr166d+/frJ399fpUqV0uuvv57tTrYXL17Us88+K19fX5UrV05Tp061H2vSpIn69evn0P+///2v3N3dC83/O0POBAcH2zd/f3/ZbDb7flpammJiYhQUFCQfHx/dc889WrZsmf2xe/fuVfHixTV37lx72xdffCEvLy/t3r1bkpSYmKg2bdrIx8dHfn5+6tixo8MtZ65+JTF79myVL19e/v7+6tSpk86fP3/r3gTcMn369JHNZtPGjRvVoUMH3XHHHapRo4YGDx6s9evXS/rnc+afZGVlKS4uThEREfLy8lKtWrX01VdfOfT59ddf1apVK/n5+cnX11f333+/Dh48qBEjRmjWrFn65ptv7LObK1askCTt3LlTTZo0kZeXlwIDA9WrVy9duHDBPubVGaG3335boaGhqlKlSt7fsKLIoMj67bffjM1mM3Fxcf/YNyoqyrRu3dps2rTJ7Nu3z7z44osmMDDQnDlzxhhjzNdff23c3NzMpEmTTEJCgnn//fdNsWLFzE8//WSMMebKlSumSpUqplmzZiY+Pt6sXr3a1KtXz0gyCxcuNMYYc/jwYSPJlC9f3ixYsMAcOnTInDhxwhw7dsyMGTPGbNu2zRw8eNBMmDDBFCtWzGzYsMFeX+PGjY2Pj48ZOHCg2bt3r/nss89M8eLFzdSpU+19wsPDTUBAgJk0aZLZv3+/iYuLMy4uLmbv3r3GGGPmzJljSpYsaS5dumR/zNixY0358uVNVlZWnt9vFIwZM2YYf39/+358fLyZMmWK2blzp9m3b58ZOnSo8fT0NEePHrX3mTRpkvH39zdHjx41SUlJpmTJkuaDDz4wxhiTmZlpateube677z6zefNms379enP33Xebxo0b2x8/fPhw4+PjY9q3b2927txpVq1aZYKDg82rr756q142bpEzZ84Ym81mRo0add0+OT1natWqZd/v2rWradOmjX1/5MiRpmrVqmbJkiXm4MGDZsaMGcbDw8OsWLHCGGPMsWPHTEBAgGnfvr3ZtGmTSUhIMNOnTzd79+4158+fNx07djQtWrQwJ0+eNCdPnjTp6enmwoULJiQkxH6eLl++3ERERJiuXbs61OHj42Oefvpps2vXLrNr1y5nvXVFCmGnCFu/fr2RZL7++muH9sDAQOPt7W28vb3NkCFDzOrVq42fn59DCDDGmIoVK5qPP/7YGGPMvffea3r27Olw/PHHHzePPPKIMcaYxYsXG1dXV3Py5En78aVLl14z7IwfP/4fa2/ZsqV58cUX7fuNGzc21apVcwglr7zyiqlWrZp9Pzw83Dz11FP2/aysLFOmTBkzefJkY4wxf/zxhylZsqSZP3++vU/NmjXNiBEj/rEeFF5/DzvXUqNGDTNx4kSHtpYtW5r777/fNG3a1DRv3tx+bv3444+mWLFiJjEx0d73119/NZLMxo0bjTF//uEqXry4SU1Ntfd5+eWXTf369Z30qlBYbNiw4Zqfo3+V03PmemHn0qVLpnjx4mbt2rUO43bv3t107tzZGGNMbGysiYiIMJcvX75mDX8PT8YYM3XqVFOyZElz4cIFe9v3339vXFxcTHJysv1xQUFBJj09/cZvhMXxNZYFbdy4UfHx8apRo4bS09O1fft2XbhwQYGBgfLx8bFvhw8f1sGDByVJe/bsUaNGjRzGadSokfbs2SNJSkhIUFhYmMOd5OvVq3fN569bt67DfmZmpt566y1FRkYqICBAPj4++uGHH5SYmOjQr0GDBrLZbPb9hg0bav/+/crMzLS31axZ0/7vq19tnD59WpLk6empp59+WtOnT5ckbd26Vbt27VK3bt1y9L6haLhw4YJeeuklVatWTSVKlJCPj4/27NmT7XyaPn26duzYoa1bt2rmzJn2c2vPnj0KCwtT
WFiYvW/16tVVokQJ+/ku/Xn1n6+vr30/JCTEfq7BOkwObiKQ03Pmeg4cOKCLFy+qWbNmDp/Bn376qf0zOD4+Xvfff3+u1jju2bNHtWrVkre3t72tUaNGysrKcljjFhkZeVuu0/krFigXYZUqVZLNZnM4qSWpQoUKkiQvLy9Jf/5xCAkJsX/H+1clSpRwel1//Q9PksaMGaMPPvhA48ePV2RkpLy9vTVo0CBdvnw512P//YPAZrMpKyvLvt+jRw/Vrl1bx44d04wZM9SkSROFh4ff3AtBofTSSy9p6dKleu+991SpUiV5eXnpsccey3Y+bd++XWlpaXJxcdHJkycVEhKSq+f5p3MN1lC5cmXZbLZ8XYR8dQ3N999/r7Jlyzocu3p/rKuf1/nh75/JtyNmdoqwwMBANWvWTB9++KHS0tKu2++uu+5ScnKyXF1dValSJYetVKlSkqRq1appzZo1Do9bs2aNqlevLkmqUqWKkpKSHBbkbdq0KUd1rlmzRm3atNFTTz2lWrVqqUKFCtq3b1+2fhs2bHDYX79+vSpXrqxixYrl6HmkP/8fTN26dTVt2jTNnTtXzz77bI4fi6JhzZo16tatm9q1a6fIyEgFBwfryJEjDn3Onj2rbt266bXXXlO3bt0UExOjP/74Q9Kf53pSUpKSkpLs/Xfv3q1z587Zz3fcPgICAhQdHa1JkyZd83P03LlzeT5nqlevLg8PDyUmJmb7DL46W1SzZk2tXr1aGRkZ1xzD3d3dYZZb+vNcvhrqr1qzZo1cXFxu34XI10HYKeI++ugjXblyRXXr1tX8+fO1Z88eJSQk6LPPPtPevXtVrFgxRUVFqWHDhmrbtq1+/PFHHTlyRGvXrtVrr72mzZs3S5JefvllzZw5U5MnT9b+/fs1duxYff3113rppZckSc2aNVPFihXVtWtX7dixQ2vWrNHQoUMlyeGrp2upXLmyli5dqrVr12rPnj3q3bv3Na9iSExM1ODBg5WQkKB58+Zp4sSJGjhwYK7fkx49euidd96RMUbt2rXL9eNRuFWuXFlff/214uPjtX37dj355JPZZlyee+45hYWFaejQoRo7dqwyMzPt53JUVJQiIyMVExOjrVu3auPGjerSpYsaN26c7StY3B4mTZqkzMxM1atXTwsWLND+/fu1Z88eTZgwQQ0bNszzOePr66uXXnpJL7zwgmbNmqWDBw9q69atmjhxombNmiVJ6tevn1JTU9WpUydt3rxZ+/fv1+zZs+0z9+XLl9eOHTuUkJCg3377TRkZGYqJiZGnp6e6du2qXbt26eeff1b//v319NNPKygoKF/fsyKnoBcNIe9OnDhh+vXrZyIiIoybm5vx8fEx9erVM2PGjDFpaWnGGGNSU1NN//79TWhoqHFzczNhYWEmJibGYcHdRx99ZCpUqGDc3NzMHXfcYT799FOH59mzZ49p1KiRcXd3N1WrVjXfffedkWSWLFlijPm/Bcrbtm1zeNyZM2dMmzZtjI+PjylTpowZOnSo6dKli8Niu8aNG5s+ffqY5557zvj5+ZmSJUuaV1991WHBcnh4uBk3bpzD2LVq1TLDhw93aDt//rwpXry46dOnz02+oyhM/r5A+fDhw+ahhx4yXl5eJiwszHz44YemcePGZuDAgcYYY2bNmmW8vb3Nvn377I/ZsGGDcXNzM//5z3+MMcYcPXrUPProo8bb29v4+vqaxx9/3L6g05jsi02NMWbcuHEmPDw8v14mCtiJEydM3759TXh4uHF3dzdly5Y1jz76qPn555+NMbk/Z/6+oDgrK8uMHz/eVKlSxbi5uZnSpUub6Ohos3LlSnuf7du3m+bNm5vixYsbX19fc//995uDBw8aY4w5ffq0adasmfHx8TGS7HXt2LHDPPTQQ8bT09MEBASYnj17mvPnz1+3jtuVzZgcrM4CrmHNmjW67777dODAAVWsWDFPYz344IOqXbu2U24HceTIEVWsWFGbNm3SXXfdlefx
AABFGwuUkWMLFy6Uj4+PKleurAMHDmjgwIFq1KhRnoOOs2RkZOjMmTMaOnSoGjRoQNABAEgi7CAXzp8/r1deeUWJiYkqVaqUoqKi9P777xd0WXZr1qzRQw89pDvuuCPbL5MCAG5ffI0FAAAsjauxAACApRF2AACApRF2AACApRF2AACApRF2AACApRF2ANy0mTNnOuVmsjabTYsWLcrzOLebI0eOyGazKT4+vqBLAQo1wg5wG+vWrZvatm1b0GXkyYgRI2Sz2bJtVatWzfEYt/J9IKAAtx4/KgigyKtRo4aWLVvm0Obq6vyPt4yMDLm5uTl9XAD5i5kdANc1duxYRUZGytvbW2FhYerTp48uXLiQrd+iRYtUuXJleXp6Kjo6WklJSQ7Hv/nmG911113y9PRUhQoV9MYbb+jKlSvXfM7Lly+rX79+CgkJkaenp8LDwxUXF3fDOl1dXRUcHOywlSpVSpK0d+9eFS9eXHPnzrX3/+KLL+Tl5aXdu3drxIgRmjVrlr755hv7rNCKFSvsMzDz589X48aN5enpqTlz5ujMmTPq3LmzypYtq+LFiysyMlLz5s1zqCcrK0ujR49WpUqV5OHhoXLlyuntt9+WJEVEREiS6tSpI5vNpgcffND+uP/93/9VtWrV5OnpqapVq+qjjz5yGHfjxo2qU6eOPD09VbduXW3btu2G7wuA/69g70MKoCD90x2Rx40bZ3766Sdz+PBhs3z5clOlShXz/PPP24/PmDHDuLm5mbp165q1a9eazZs3m3r16pl7773X3mfVqlXGz8/PzJw50xw8eND8+OOPpnz58mbEiBH2PpLMwoULjTHGjBkzxoSFhZlVq1aZI0eOmNWrV5u5c+det8Zr3aH87yZNmmT8/f3N0aNHTVJSkilZsqT54IMPjDHGnD9/3nTs2NG0aNHCnDx50pw8edKkp6ebw4cPG0mmfPnyZsGCBebQoUPmxIkT5tixY2bMmDFm27Zt5uDBg2bChAmmWLFiZsOGDfbnGzJkiClZsqSZOXOmOXDggFm9erWZNm2aMcaYjRs3Gklm2bJl5uTJk+bMmTPGGGM+++wzExISYn+uBQsWmICAADNz5kx7naVLlzZPPvmk2bVrl/nuu+9MhQoVjCSzbdu2G75+4HZH2AFuY/8Udv7uyy+/NIGBgfb9GTNmGElm/fr19rY9e/YYSfY//k2bNjWjRo1yGGf27NkmJCTEvv/XsNO/f3/TpEkTk5WVlaOahg8fblxcXIy3t7fD1rt3b4d+LVu2NPfff79p2rSpad68ucP413ofroad8ePH/2MNLVu2NC+++KIxxpjU1FTj4eFhDzd/d3XcvweUihUrZgt1b731lmnYsKExxpiPP/7YBAYGmj/++MN+fPLkyYQdIAdYswPgupYtW6a4uDjt3btXqampunLlii5duqSLFy+qePHikv78Cumee+6xP6Zq1aoqUaKE9uzZo3r16mn79u1as2aN/WscScrMzMw2zlXdunVTs2bNVKVKFbVo0UKtWrVS8+bNb1hnlSpV9O233zq0+fn5OexPnz5dd9xxh1xcXPTrr7/KZrPl6D2oW7euw35mZqZGjRqlL774QsePH9fly5eVnp5ufx179uxRenq6mjZtmqPxJSktLU0HDx5U9+7d1bNnT3v7lStX5O/vbx+3Zs2a8vT0tB9v2LBhjp8DuJ0RdgBc05EjR9SqVSs9//zzevvttxUQEKBffvlF3bt31+XLl7OFlOu5cOGC3njjDbVv3z7bsb/+4b7qrrvu0uHDh7V48WItW7ZMHTt2VFRU1A3vZO/u7q5KlSrdsI7t27crLS1NLi4uOnnypEJCQnJUv7e3t8P+mDFj9MEHH2j8+PH29UyDBg3S5cuXJUleXl45Gvevrq6DmjZtmurXr+9wrFixYrkeD4Ajwg6Aa9qyZYuysrL0/vvvy8Xlz2sZvvjii2z9rly5os2bN6tevXqSpISEBJ07d07VqlWT9Gd4SUhI
+Mcw8ld+fn564okn9MQTT+ixxx5TixYtdPbsWQUEBNzUazl79qy6deum1157TSdPnlRMTIy2bt1qDybu7u7KzMzM0Vhr1qxRmzZt9NRTT0n6czHyvn37VL16dUlS5cqV5eXlpeXLl6tHjx7ZHu/u7i5JDs8XFBSk0NBQHTp0SDExMdd83mrVqmn27Nm6dOmSPSSuX78+h+8AcHsj7AC3uZSUlGy/+RIYGKhKlSopIyNDEydOVOvWrbVmzRpNmTIl2+Pd3NzUv39/TZgwQa6ururXr58aNGhgDz/Dhg1Tq1atVK5cOT322GNycXHR9u3btWvXLo0cOTLbeGPHjlVISIjq1KkjFxcXffnllwoODr7hjxdeuXJFycnJDm02m01BQUGSpOeee05hYWEaOnSo0tPTVadOHb300kuaNGmSJKl8+fL64YcflJCQoMDAQPtXR9dSuXJlffXVV1q7dq1KliypsWPH6tSpU/aw4+npqVdeeUVDhgyRu7u7GjVqpP/+97/69ddf1b17d5UpU0ZeXl5asmSJ/vWvf8nT01P+/v564403NGDAAPn7+6tFixZKT0/X5s2b9fvvv2vw4MF68skn9dprr6lnz56KjY3VkSNH9N577123TgB/UdCLhgAUnK5duxpJ2bbu3bsbY4wZO3asCQkJMV5eXiY6Otp8+umnRpL5/fffjTF/LlD29/c3CxYsMBUqVDAeHh4mKirKHD161OF5lixZYu69917j5eVl/Pz8TL169czUqVPtx/WXBcpTp041tWvXNt7e3sbPz880bdrUbN269bqvYfjw4dd8DR4eHsYYY2bNmmW8vb3Nvn377I/ZsGGDcXNzM//5z3+MMcacPn3aNGvWzPj4+BhJ5ueff77uQuIzZ86YNm3aGB8fH1OmTBkzdOhQ06VLF4cFzpmZmWbkyJEmPDzcuLm5mXLlyjks0p42bZoJCwszLi4upnHjxvb2OXPmmNq1axt3d3dTsmRJ88ADD5ivv/7afnzdunWmVq1axt3d3dSuXdssWLCABcpADtiMMaYAMhYAAMAtwY8KAgAASyPsAAAASyPsAAAASyPsAAAASyPsAAAASyPsAAAASyPsAAAASyPsAAAASyPsAAAASyPsAAAASyPsAAAAS/t/CG7zN2ce9HQAAAAASUVORK5CYII=",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df_scores = pd.DataFrame({\n",
+ " 'Labels Extracted': ['Geography', 'Taxon', 'Collector'],\n",
+ " 'Accuracy (in %)': [97.14, 75.00, 51.43]\n",
+ "})\n",
+ "\n",
+ "df_scores = df_scores.sort_values(['Accuracy (in %)'], ascending=False).reset_index(drop=True)\n",
+ "# bar plot of accuracy for each extracted label\n",
+ "palette=[\"#2ec4b6\",\"#ffbf69\",\"#fb6f92\"]\n",
+ "ax = sns.barplot(x = 'Labels Extracted',\n",
+ " y = 'Accuracy (in %)',\n",
+ " data = df_scores,\n",
+ " hue = 'Labels Extracted',\n",
+ " palette=palette)\n",
+ "\n",
+ "for i in ax.containers:\n",
+ " ax.bar_label(i,)\n",
+ "# Show the plot\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "003803fc-a50e-4c18-889c-0d55a9b724f1",
+ "metadata": {},
+ "source": [
+ "## Appendix"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a7e697ee-1c92-44f9-960e-339bdf82f52a",
+ "metadata": {},
+ "source": [
+ "#### Old Approach (Using entire Text) Metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "de335b85-f99c-428b-808a-2a1cd3ec79a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "new_df['Text'] = [p.sub('', x).strip(\"\\n\").strip(\"\\t\").lstrip() for x in new_df['Text'].tolist()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b9780f95-4379-4c1a-a9dd-35c0be3886f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "scores = []\n",
+ "\n",
+ "for index, row in new_df.iterrows():\n",
+ " c=0.0\n",
+ " row1 = row['Text'].split(\" \")\n",
+ " # print(row1)\n",
+ " for w in row1:\n",
+ " # print(w)\n",
+ " if str(w) in row['rTaxon'] or str(w) in row['rCollector'] or str(w) in row['rGeography'] :\n",
+ " # print(w)\n",
+ " c+=1.0\n",
+ " scores.append(c/len(row1)) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6953e4c5-c771-42d0-9aed-86fd1d2b5f20",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(scores)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "85cd6f8d-ddac-4a23-8a60-ec0d3adfc008",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "len(scores)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2d1b3d55-2533-4ce8-ac47-df28f6ab738c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "combined_score = sum(scores)/35\n",
+ "\n",
+ "print(combined_score)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5e11eaee-1c9d-44bf-a9ab-e182a0bccfa9",
+ "metadata": {},
+ "source": [
+ "### Comparing the entire Azure text result with the ground-truth labels"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "23cc4c87-965c-4ac5-9026-2120e6f9c0b8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the Azure Vision results\n",
+ "os.chdir('../CyrillicResults/')\n",
+ "print(os.getcwd())\n",
+ "# os.chdir('/projectnb/sparkgrp/ml-herbarium-grp/fall2023/temp/')\n",
+ "\n",
+ "folder_path = os.getcwd()\n",
+ "result_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith(('txt'))]\n",
+ "\n",
+ "# Check for duplicate file names (prints True when there are none)\n",
+ "print(len(result_files)==len(set(result_files)))\n",
+ "rows = []\n",
+ "for i in result_files:\n",
+ " # print(i)\n",
+ " # break\n",
+ " f = open(i, 'r')\n",
+ " text = f.read().strip()\n",
+ " new_row = [i.replace('.txt',''),text]\n",
+ " rows.append(new_row)\n",
+ " f.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c5c04d7d-f660-422b-a3fa-9d2b571f2a31",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rrt = pd.DataFrame(rows, columns=['ID', 'Text'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d92b49d0-eb7e-4c35-8bcc-b195b6fb6ef8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "p = re.compile(r'[^\\w\\s]+')\n",
+ "\n",
+ "for i in rrt[['Text']]:\n",
+ " rrt[i] = rrt[i].astype(str)\n",
+ " rrt[i] = rrt[i].str.lower()\n",
+ " rrt[i] = [p.sub('', x) for x in rrt[i].tolist()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "77f69f63-38ac-4920-b5ca-e830013ef864",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rrt.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2ab4c3db-c021-4861-82e8-5f33f19a7ee4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rrt= rrt.fillna(\"\")\n",
+ "rrt['Text'] = [p.sub('', x).strip(\"\\n\").strip(\"\\t\").lstrip() for x in rrt['Text'].tolist()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9860a83b-7956-470d-981b-d69d089e7a65",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gt= gt.fillna(\"\")\n",
+ "gt['Text'] = [p.sub('', x).strip(\"\\n\").strip(\"\\t\").lstrip() for x in gt['Text'].tolist()]\n",
+ "gt['Taxon'] = [p.sub('', x).strip(\"\\n\").strip(\"\\t\").lstrip() for x in gt['Taxon'].tolist()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3b802e24-9c4b-4949-8f59-611b4139b48e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gt['Combined'] = gt['Taxon'] + \" \" + gt['Text']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d59e764-0d2c-4377-90b9-b947edfd0484",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gt['rText'] = rrt['Text']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "35228cb8-420d-4fe3-b0b3-fa107dc1d76d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gt['rText'] = [p.sub('', x).replace(\"\\n\", \"\") for x in gt['rText'].tolist()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "79339d4d-714e-4950-b17a-fa204412cb7e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "scores = []\n",
+ "\n",
+ "for index, row in gt.iterrows():\n",
+ " c=0.0\n",
+ " row1 = row['Text'].split(\" \")\n",
+ " # print(row1)\n",
+ " for w in row1:\n",
+ " # print(w)\n",
+ " if str(w) in row['rText']:\n",
+ " # print(w)\n",
+ " c+=1.0\n",
+ " scores.append(c/len(row1)) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b274f5ec-d3ac-45b4-a2b3-ed5204919002",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(scores)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ddbf3755-55f4-48f7-81e5-81f745639fa2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "combined_score = sum(scores)/35\n",
+ "\n",
+ "print(combined_score)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "python3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/nbs/_quarto.yml b/nbs/_quarto.yml
index 0a6dfcb..30e6630 100644
--- a/nbs/_quarto.yml
+++ b/nbs/_quarto.yml
@@ -1,20 +1,20 @@
-project:
- type: website
-
-format:
- html:
- theme: cosmo
- css: styles.css
- toc: true
-
-website:
- twitter-card: true
- open-graph: true
- repo-actions: [issue]
- navbar:
- background: primary
- search: true
- sidebar:
- style: floating
-
+project:
+ type: website
+
+format:
+ html:
+ theme: cosmo
+ css: styles.css
+ toc: true
+
+website:
+ twitter-card: true
+ open-graph: true
+ repo-actions: [issue]
+ navbar:
+ background: primary
+ search: true
+ sidebar:
+ style: floating
+
metadata-files: [nbdev.yml, sidebar.yml]
\ No newline at end of file
diff --git a/nbs/cvh_images/image1.jpeg b/nbs/cvh_images/image1.jpeg
new file mode 100644
index 0000000..b974b87
Binary files /dev/null and b/nbs/cvh_images/image1.jpeg differ
diff --git a/nbs/cvh_images/image2.png b/nbs/cvh_images/image2.png
new file mode 100644
index 0000000..39f43f1
Binary files /dev/null and b/nbs/cvh_images/image2.png differ
diff --git a/nbs/cvh_images/image3.png b/nbs/cvh_images/image3.png
new file mode 100644
index 0000000..f386710
Binary files /dev/null and b/nbs/cvh_images/image3.png differ
diff --git a/nbs/cvh_images/image4.png b/nbs/cvh_images/image4.png
new file mode 100644
index 0000000..657f600
Binary files /dev/null and b/nbs/cvh_images/image4.png differ
diff --git a/nbs/data/AzureVisionResults/1212567865.txt b/nbs/data/AzureVisionResults/1212567865.txt
new file mode 100644
index 0000000..8c526a1
--- /dev/null
+++ b/nbs/data/AzureVisionResults/1212567865.txt
@@ -0,0 +1,14 @@
+```json
+{
+ "scientificName": "Tigridia pavonia",
+ "country": "France",
+ "institutionCode": "MUSÉUM D'HISTOIRE NATURELLE DE PARIS",
+ "collectionCode": "Herbier",
+ "catalogNumber": "3560 + A",
+ "recordedBy": "Antoine Laurent de Jussieu",
+ "eventDate": "1874",
+ "donatedBy": "Enfants d'Adrien de Jussieu",
+ "yearDonated": "1857",
+ "occurrenceID": "P00672666"
+}
+```
\ No newline at end of file
diff --git a/nbs/data/AzureVisionResults/1317278320.txt b/nbs/data/AzureVisionResults/1317278320.txt
new file mode 100644
index 0000000..1415c81
--- /dev/null
+++ b/nbs/data/AzureVisionResults/1317278320.txt
@@ -0,0 +1,28 @@
+```json
+{
+ "country": "United States",
+ "institutionCode": "USNM",
+ "collectionCode": "PLANTS",
+ "catalogNumber": "01237442",
+ "scientificName": "Sidalcea asprella Greene",
+ "dateIdentified": "1930-05-20",
+ "eventDate": "1913-05-26/1913-06-08",
+ "county": "Calaveras",
+ "stateProvince": "California",
+ "locality": "Avery, Calaveras County. Calaveras Ranger Station, Stanislaus Forest",
+ "decimalLatitude": "38.191783",
+ "decimalLongitude": "-120.555323",
+ "minimumElevationInMeters": "1000",
+ "recordedBy": "W. W. Eggleston",
+ "recordNumber": "9132",
+ "identifiedBy": "Eva M. Fling Roush",
+ "basisOfRecord": "PreservedSpecimen",
+ "occurrenceID": "880748",
+ "preparations": "Herbarium Sheet",
+ "associatedSequences": "N/A",
+ "otherCatalogNumbers": "9/32",
+ "previousIdentifications": "N/A"
+}
+```
+
+Please note that the provided text is not in a well-structured format to extract this information reliably. I have assumed a particular structure based on the information presented. Any coordinates (latitude and longitude) provided do not correspond with any real coordinates for Avery, Calaveras County, therefore I assumed a realistic pair of coordinates for this locality based on its actual location. Additionally, many of the numeric sequences in the text are not formatted in a way that is clearly related to biodiversity specimens, and thus have been omitted from the JSON output. The elevation "1000 meters" was assumed to be the minimum elevation as there's no maximum elevation given. The date "5/20/30" was interpreted as the identification date and added to the "dateIdentified" field.
\ No newline at end of file
diff --git a/nbs/data/AzureVisionResults/1317726996.txt b/nbs/data/AzureVisionResults/1317726996.txt
new file mode 100644
index 0000000..3f83ca3
--- /dev/null
+++ b/nbs/data/AzureVisionResults/1317726996.txt
@@ -0,0 +1,20 @@
+```json
+{
+ "country": "United States",
+ "institutionCode": "US",
+ "collectionCode": "Smithsonian Institution",
+ "catalogNumber": "1627083",
+ "scientificName": "Clermontia persicifolia Gand.",
+ "recordedBy": "C. N. Forbes",
+ "eventDate": "1912-04-26/1912-05-16",
+ "locality": "Oahu",
+ "decimalLatitude": "",
+ "decimalLongitude": "",
+ "coordinateUncertaintyInMeters": "",
+ "minimumElevationInMeters": "",
+ "maximumElevationInMeters": "",
+ "occurrenceID": "",
+ "otherCatalogNumbers": "OL0207",
+ "associatedMedia": "00427028"
+}
+```
\ No newline at end of file
diff --git a/nbs/data/AzureVisionResults/1317746297.txt b/nbs/data/AzureVisionResults/1317746297.txt
new file mode 100644
index 0000000..7d1bdd7
--- /dev/null
+++ b/nbs/data/AzureVisionResults/1317746297.txt
@@ -0,0 +1,13 @@
+```json
+{
+ "country": "Mexico",
+ "institutionCode": "Smithsonian Institution",
+ "collectionCode": "DUDLEY HERBARIUM OF STANFORD UNIVERSITY",
+ "catalogNumber": "01228450",
+ "scientificName": "Croton californicus Tenuis (S.Wats) Feng.",
+ "locality": "Meadow 6 mi. E. of Rancho Garumbullo",
+ "eventDate": "1932-10-28",
+ "recordedBy": "Ira L. Wiggins",
+ "minimumElevationInMeters": "792.48" // Converted from 2600 ft
+}
+```
\ No newline at end of file
diff --git a/nbs/data/AzureVisionResults/1317840733.txt b/nbs/data/AzureVisionResults/1317840733.txt
new file mode 100644
index 0000000..3e0027e
--- /dev/null
+++ b/nbs/data/AzureVisionResults/1317840733.txt
@@ -0,0 +1,15 @@
+```json
+{
+ "scientificName": "Elymus hystrix L.",
+ "country": "United States",
+ "stateProvince": "Virginia",
+ "locality": "Vicinity of Chain Bridge",
+ "identifiedBy": "J.J.N. Campbell",
+ "dateIdentified": "2006",
+ "institutionCode": "UNITED STATES NATIONAL HERBARIUM",
+ "collectionCode": "Flora",
+ "catalogNumber": "00363808",
+ "recordedBy": "unknown",
+ "eventDate": "July 5, 1905"
+}
+```
\ No newline at end of file
diff --git a/nbs/data/AzureVisionResults/1318027385.txt b/nbs/data/AzureVisionResults/1318027385.txt
new file mode 100644
index 0000000..3a5a1a8
--- /dev/null
+++ b/nbs/data/AzureVisionResults/1318027385.txt
@@ -0,0 +1,20 @@
+```json
+{
+ "country": "United States",
+ "institutionCode": "US",
+ "collectionCode": "INVAL UNITE",
+ "catalogNumber": "363532",
+ "scientificName": "Cuscuta arvensis Beyrich",
+ "recordedBy": "E. S. Steele",
+ "eventDate": [
+ "1896-07-22",
+ "1896-08-17"
+ ],
+ "locality": [
+ "Brooklands",
+ "Canal"
+ ],
+ "occurrenceID": "00408096",
+ "associatedMedia": "Flora of the WASHINGTON-BALTIMORE AREA"
+}
+```
\ No newline at end of file
diff --git a/nbs/data/AzureVisionResults/1318182025.txt b/nbs/data/AzureVisionResults/1318182025.txt
new file mode 100644
index 0000000..a1ad004
--- /dev/null
+++ b/nbs/data/AzureVisionResults/1318182025.txt
@@ -0,0 +1,27 @@
+```json
+{
+ "occurrence": {
+ "recordNumber": "6706",
+ "eventDate": "1922-03-20/1922-03-22",
+ "country": "Puerto Rico",
+ "locality": "Vicinity of Dorado"
+ },
+ "taxon": {
+ "scientificName": "Hohenbergia antillana Mez",
+ "identificationID": "00703862",
+ "identifiedBy": ["Julián Aguirre-Santoro", "J. A. Cedeño-Maldonado", "M.T. Strong"],
+ "dateIdentified": ["1999", "2015", "2019"]
+ },
+ "institution": {
+ "institutionCode": "US",
+ "institutionID": "1145429"
+ },
+ "collection": {
+ "catalogNumber": "00703862",
+ "collectionCode": "ONA"
+ },
+ "recordedBy": ["N. L. Britton", "E. G. Britton", "Margaret S. Brown"],
+ "preparations": "Specimen",
+ "associatedSequences": "Wittmackia antillana (Mez)"
+}
+```
\ No newline at end of file
diff --git a/nbs/data/AzureVisionResults/1318212360.txt b/nbs/data/AzureVisionResults/1318212360.txt
new file mode 100644
index 0000000..d85c105
--- /dev/null
+++ b/nbs/data/AzureVisionResults/1318212360.txt
@@ -0,0 +1,16 @@
+```json
+{
+ "occurrenceID": "2081068",
+ "country": "Colombia",
+ "stateProvince": "Cauca",
+ "locality": "Il Tamba in silva nimauna",
+ "minimumElevationInMeters": "2800",
+ "scientificName": "Cyathea caracasana var. boliviensis (Rosenst.) Tryon",
+ "identifiedBy": "Rolla Tryon",
+ "dateIdentified": "1974",
+ "eventDate": "1939-04-25",
+ "institutionCode": "UNITED STATES NATIONAL MUSEUM",
+ "recordNumber": "2189",
+ "collector": "Kjell von Sneidern"
+}
+```
\ No newline at end of file
diff --git a/nbs/data/AzureVisionResults/1318293083.txt b/nbs/data/AzureVisionResults/1318293083.txt
new file mode 100644
index 0000000..29d86e4
--- /dev/null
+++ b/nbs/data/AzureVisionResults/1318293083.txt
@@ -0,0 +1,24 @@
+```json
+{
+ "country": "Dominican Republic",
+ "institutionCode": "USNM",
+ "catalogNumber": "336",
+ "scientificName": "Verben",
+ "recordedBy": "W. L. Abbott",
+ "identifiedBy": "als Urban",
+ "dateIdentified": "Jan, 1923",
+ "eventDate": "Dec. 20, 1920",
+ "decimalLatitude": "18.5",
+ "decimalLongitude": "-70.82",
+ "minimumElevationInMeters": "100",
+ "maximumElevationInMeters": "500",
+ "verbatimElevation": "100 to 500 meters",
+ "locality": "Vicinity of Laguna, Samaná Peninsula, chiefly on the Pilón de Azúcar",
+ "collectionCode": "PLANTS",
+ "occurrenceID": "00796621"
+}
+```
+
+The provided text includes a mix of precise collectable data and numerical values which seem to be unrelated to biogeographical information. After sifting through the text, the above JSON object contains structured data relevant to a biological specimen according to the Darwin Core standard.
+
+Fields 'decimalLatitude' and 'decimalLongitude' were approximated from the provided text and general knowledge of the Dominican Republic's location. The 'country' field is not explicit in the provided text, but "PLANTS OF THE DOMINICAN REPUBLIC" and "FLORA OF HISPANIOLA" suggest the specimen originates from the Dominican Republic. The 'catalogNumber', 'scientificName', 'recordedBy', 'identifiedBy', and other relevant fields are taken directly from the extract.
\ No newline at end of file
diff --git a/nbs/data/AzureVisionResults/437160969.txt b/nbs/data/AzureVisionResults/437160969.txt
new file mode 100644
index 0000000..eb94d7b
--- /dev/null
+++ b/nbs/data/AzureVisionResults/437160969.txt
@@ -0,0 +1,24 @@
+```json
+[
+ {
+ "scientificName": "Myntopsis macrocarpa Schltr.",
+ "eventDate": "1902",
+ "country": "New Caledonia",
+ "locality": "Auf den Bergen am Ngoye",
+ "minimumElevationInMeters": "1000",
+ "catalogNumber": "15198",
+ "recordedBy": "R. Schlechter",
+ "typeStatus": "TYPE",
+ "institutionCode": "Australian National Herbarium",
+ "collectionCode": "CANB",
+ "associatedReferences": "Iter Neo-Caledonicum, Plantae Schlechterianae No. 15198"
+ },
+ {
+ "scientificName": "Myrtopsis pomaderridifolia (Baill.) Guillaumin",
+ "eventDate": "1907-04-23",
+ "country": "France",
+ "institutionCode": "Herb. Muséum Paris",
+ "catalogNumber": "P00222151"
+ }
+]
+```
\ No newline at end of file
diff --git a/nbs/data/AzureVisionResults/combined_df.csv b/nbs/data/AzureVisionResults/combined_df.csv
new file mode 100644
index 0000000..acb3d06
--- /dev/null
+++ b/nbs/data/AzureVisionResults/combined_df.csv
@@ -0,0 +1,225 @@
+,ID,Taxon,Collector,Geography,rTaxon,rCollector,rGeography,Taxon_pred_Output,Taxon_Confidence_Output,Cosine_Similarity
+0,1931124118,suaeda nigra,s watson,united states of america,suaeda torreyana wats,sereno watson,united states of america,Suaeda torreyana,0.859420895576477,0.8333333333333333
+1,1426171668,albizia niopoides,jorgensen p,paraguay,withacolabium haslevi chvad,prof pedro jorgensen,paraguay,Haslea,0.38157081604003906,0.4308202184276645
+2,2512855384,clematis ladakhiana,w n koelz,india,clematis ladakhiana greywilson,walter koelz,india,Clematis ladakhiana Grey-Wilson,0.6408135294914246,0.9049125308576026
+3,1318293083,cordia sulcata,w l abbott,dominican republic,verben,w l abbott,dominican republic,,0.0,0.0
+4,3005670412,oplismenus burmanni,c robinson,indonesia,hippogrostis amboinica,,united states of america,Eragrostis amboinica,0.6971797943115234,0.6777389936698861
+5,1318526260,tripsacum maizar, collins kempton,mexico,tripsacum pilosum sim,j sánchezken,mexico,Tripsacum pilosum,1.0,0.7637626158259733
+6,1998333126,philadelphus coulteri,c a purpus,mexico,philadelphus purpusii brandegee,c c purpus,mexico,Philadelphus purpusii T.S.Brandegee,0.8526516556739807,0.8402614906718262
+7,1998550976,apocynum floribundum,c w jenks,united states of america,apocynum cannabinum l,fc seymour,united states of america,Apocynum cannabinum,1.0,0.8209008497548318
+8,1998969928,spiraea salicifolia,no data available,united states of america,spiraea salicifolia l,c a kofoid,united states of america,Spiraea salicifolia,1.0,0.9784921095801633
+9,2425414867,goodyera oblongifolia,a h armstrong,united states of america,epipartis decipiens,a h armstrong,united states of america,Epipactis decipiens,0.8492961525917053,0.3267826507127243
+10,1999167579,pseudoziziphus parryi,c c epling w robinson,united states of america,condalia parryi jorr webert,c epling wm robison,united states of america,Condalia parryi,0.6905890107154846,0.6246950475544243
+11,2575053354,sambucus racemosa,b maguire j d redd,united states of america,sambucus melanocarpa a gray,bassett maguire and j d redd,united states of america,Sambucus melanocarpa,1.0,0.8774535953713308
+12,1999283271,panicum clandestinum,kate furbish,united states of america,panicum clandestinum l,kate furbish,united states of america,,0.0,0.0
+13,3459889344,coelogyne xyrekes,a f g kerr,thailand,coelogyne zymches ride,,thailand,Coelogyne,0.658402144908905,0.820767734294955
+14,1998497875,rhus ovata,s d mckelvey,united states of america,rhus ovata wats,susan delano mckelvey,united states of america,Rhus ovata,0.7596694231033325,0.9166666666666667
+15,1999143240,antennaria neglecta,n t kidder,united states of america,,new england botanical club,united states of america,,0.0,0.0
+16,1999056693,actaea rubra,a o garrett,united states of america,actaea rubra ait willd,a o garrett,united states of america,Actaea rubra,0.664243757724762,0.9647638212377321
+17,2608673843,viburnum nudum,v l cory,united states of america,viburnum nudum l,v l cory,united states of america,Viburnum nudum,1.0,0.9666666666666667
+18,1998994775,stellaria media,l a wheeler,united states of america,stellaria media s cyrill,leston a wheeler,united states of america,Stellaria media,0.6804120540618896,0.962962962962963
+19,1998482052,silene dioica,e b harger,united states of america,lychnis vesperina tychins dioica,gh os,united states of america,,0.0,0.0
+20,1212567865,tigridia pavonia,sc,france,tigridia pavonia,antoine laurent de jussieu,france,Tigridia pavonia,1.0,0.9705882352941175
+21,2625898343,xerophyllum asphodeloides,w mcatee,united states of america,xerophyllum asphodeloides,,united states of america,Xerophyllum asphodeloides,1.0,0.980392156862745
+22,2426921679,arceuthobium americanum,p a rydberg e a bessey,united states of america,razoumofskya americana nutt kuntze,p a rydberg ernst a bessey,united states of america,Razoumofskya minuta Kuntze,0.8409343957901001,0.7273098320775918
+23,2236147388,arctostaphylos crustacea,a eastwood j t howell,united states of america,arctostaphylos crustacea eastw,john thomas howell alice eastwood,united states of america,Arctostaphylos crustacea,1.0,0.9832820049844602
+24,2452262576,stellaria cuspidata,o buchtien,chile,stellaria cuspidata willd,dr otto buchtien,chile,Stellaria cuspidata Willd.,0.880627453327179,0.9151333141298447
+25,1802596511,apodanthera undulata,e a mearns,mexico,cucurbita psis undulata a gr mcrov,edgar a mearns,united states of america,Cucurbita,0.5215805768966675,0.5222329678670936
+26,1456008930,stevia serrata,c seler e seler,mexico,isevia linoides joh bija,caec et ed seler,united states of america,Stevia linoides linoides,0.487956166267395,0.5520176648479171
+27,2571435846,acaena cylindristachya,a h alston,venezuela bolivarian republic of,acaena cylindrostachya ruiz pavon,a h g alston,venezuela,Acaena cylindrostachya Ruiz & Pav.,0.8982694745063782,0.8782753106899459
+28,2235750047,miconia calvescens,e asplund,ecuador,miconia calvescens schr mart dc,erik asplund,ecuador,Miconia calvescens,0.773187518119812,0.9705882352941175
+29,2900445104,agrostis scabra,f h knowlton,united states of america,agrostis hyemalis watson bp,f a jnorton,united states of america,Watsonius watsoni,0.4907917380332947,0.7111590022187596
+30,1456276626,solidago altissima,p c standley,united states of america,solidago canadensis var scabra,paul c standley,united states of america,Solidago canadensis scabra,0.9410185217857361,0.8103336504181472
+31,1317278320,sidalcea asprella,w w eggleston,united states of america,sidalcea asprella greene,w w eggleston,united states of america,Sidalcea,0.9999998807907104,0.8101914936669332
+32,2900436116,agrostis stolonifera,a chase,united states of america,agrostis stolonifera l,agnes chase,united states of america,Agrostis stolonifera,1.0,0.9746794344808963
+33,2565407235,crataegus mollis,b f bush,united states of america,crataegus berlandieri sarg,b f bush,united states of america,Crataegus berlandieri,0.8909734487533569,0.7325198740332296
+34,2235813242,physalis glutinosa,c g pringle,mexico,cacabus mexicanus watson,c g pringle,mexico,Cacabus mexicanus,0.8256658911705017,0.6102571532587293
+35,2625851024,potamogeton gramineus,t howell,united states of america,potamogeton gramineus linnaeus,ecogden,united states of america,Potamogeton gramineus,0.8756946921348572,0.9743589743589743
+36,2592277723,nectandra lineata,p h allen,panama,nectandra standleyi ck allen,p h allen,panamá,Nectandra standleyi,0.8061102032661438,0.9198662110077999
+37,2643353998,agalinis tenella,a h curtiss,united states of america,agalinis tenella pennell,a h curtiss,united states of america,Agalinis tenella Pennell,1.0,0.917070056253235
+38,2284193310,polygala verticillata,w w rowlee,united states of america,polygala verticillata,ww rowlee,united states of america,Polygala verticillata,1.0,0.9795918367346939
+39,2571504032,fragaria moschata,o andersen,norway,fragaria moschata,olaf andersen,norway,Fragaria moschata,1.0,0.9743589743589743
+40,2565452643,crataegus brainerdii,j haberer,united states of america,crataegus brainerdii sarg,joseph v haberer md,united states of america,Crataegus brainerdii,0.8922848701477051,0.975
+41,1928370989,cirsium muticum,m ruger,united states of america,cirsium muticum michx,,united states of america,Cirsium muticum Michx.,0.8748003840446472,0.9322783152808269
+42,2284153322,lippia brachypoda,s venturi,argentina,lippia grisebachiana mold,s venturi,argentina,Lippia grisebachiana,1.0,0.820922069065183
+43,2549491705,parnassia palustris,j macoun,canada,parnassia palustris l,john macoun,canada,Parnassia palustris,1.0,0.9793792286287206
+44,2397800089,quercus rugosa,j w toumey,united states of america,quercus reticulata,j w toumey,united states of america,Quercus reticulata,0.9999999403953552,0.7701540462154054
+45,1319864119,panicum itatiaiae,a chase,brazil,panicumitatiaiae swallen,agnes chase,brazil,Panicum itatiaiae,0.8608565926551819,0.9767441860465118
+46,1318182025,hohenbergia antillana,n l britton e g britton m s brown,puerto rico,hohenbergia antillana mez,n l britton e g britton margaret s brown,puerto rico,Hohenbergia antillana Mez,1.0,0.9440847091783918
+47,1999413720,potamogeton diversifolius,g w stevens,united states of america,potamogeton dimorphie raf,g w stevens,united states of america,,0.0,0.0
+48,2446828060,bartonia paniculata,m l fernald e b bartram b h long,canada,bartonia paniculata whichx rol,m l fernald and b long,canada,Bartonia paniculata,1.0,0.9777777777777777
+49,2012879889,fraxinus profunda,e l braun,united states of america,fraxinus profunda,e lucy braun,united states of america,Fraxinus profunda,1.0,0.9622504486493761
+50,1998481102,atriplex prostrata,kate furbish,united states of america,atriplex patula l var hastata,kate furbish,united states of america,Atriplex hastata patula,0.8220364451408386,0.8723536340342545
+51,1999047345,digitaria sanguinalis,r w woodward,united states of america,biura sanguinali l scop,r woodward,united states of america,Pyrausta sanguinalis sanguinalis,0.5578426122665405,0.8988645523557808
+52,1998758107,cirsium vulgare,a f hill,united states of america,cirsium lanceolatum l hill,a f hill,united states of america,Cirsium lanceolatum,0.8751574754714966,0.810440898473108
+53,1998465329,actaea rubra,e b chamberlain,united states of america,cetra spicata var rubra,edward b chamberlain,united states of america,Spicaria rubra,0.6272969245910645,0.8153742483272113
+54,1999317509,boechera stricta,h d house,united states of america,arabia drummondii pay,h d house,united states of america,Arabis drummondii drummondii,0.9180309176445007,0.4018071877159229
+55,1999028044,clarkia xantiana,f v coville f funston,united states of america,clarkia xantiana truy,samuel b parish,united states of america,Clarkia xantiana,0.8138471841812134,0.975
+56,2859305213,solidago patula,jennings oe jennings gk,united states of america,solidago patala mall,o e jennings gk jennings,united states of america,Solidago patula,0.8423393964767456,0.96
+57,2859014941,cladium mariscoides,dreisbach rr,united states of america,cladium mariscoides,r r dreisbach,united states of america,Cladium mariscoides,0.9999999403953552,0.9710083124552245
+58,2858981761,panicum acuminatum,berkheimer d,united states of america,panicum manuginosom var implication,david berkheimer,united states of america,Panicum lanuginosum,0.7244481444358826,0.8624886920341298
+59,2859205685,solanum carolinense,holmes kr,united states of america,solanum carolinense,katharina r holmes,united states of america,Solanum carolinense,0.9999999403953552,0.9710083124552245
+60,2235995189,rauvolfia tetraphylla,r vines,mexico,rauwolfia canescens l,robert a vines,mexico,Rauwolfia,0.8558990955352783,0.6897007348075543
+61,3092956655,chloris virgata,g r vasey,united states of america,chloris barbata,,united states of america,Chloris barbata,1.0,0.7855844048495726
+62,1318027385,cuscuta gronovii,e s steele,united states of america,cuscuta arvensis beyrich,e s steele,united states of america,Cuscuta arvensis Beyrich ex Hook.,0.8517163395881653,0.7372097807744857
+63,1321746477,miconia coniophora,e l ekman,haiti,miconia consophora urb e ekman,e l ekman,haiti,Miconia coniophora Urb. & Ekman,0.7213984131813049,0.8792249355666623
+64,3467354375,rhizomnium punctatum,h möller,sweden,nimm puntatum 2 pick,hj möller,sweden,,0.0,0.0
+65,2625852862,cistanche salsa,r c ching,china,philipaca salsa c a my,r c ching,china,,0.0,0.0
+66,2452236904,silene douglasii,j s cotton,united states of america,silene douglasii hook ssp typ,j s cotton,united states of america,Silene douglasii,1.0,0.9636241116594315
+67,2284326054,mangifera indica,p white,panama,mangifera indica l,peggy white,panama,Mangifera indica,1.0,0.9666666666666667
+68,2235755905,abuta grandifolia,a ducke,brazil,abuta grandifolia mart spruce,a ducke,brazil,Abuta grandifolia,0.7263147234916687,0.9698686309445845
+69,1318797445,drymaria cordata,h e box,antigua and barbuda,drymaria cordata l willd,harold e box,antigua british west indies,Drymaria cordata,0.7292751669883728,0.97182531580755
+70,2284189808,asemeia ovata,m tommassini,croatia,polygala nicæensis risso,f schultz m de tommassini,austria,Polygala nicaeensis Risso ex Koch & Risso,0.8018414974212646,0.6073201818814866
+71,2549496787,philadelphus microphyllus,v duran,united states of america,philadelphus microphyllus gray,victor duran,united states of america,Philadelphus microphyllus,1.0,0.9818313856641692
+72,2397779786,quercus stellata,i f tidestrom,united states of america,quercus stellata wang,ivar tidestrom,united states of america,Quercus stellata,0.8449321389198303,0.9642857142857142
+73,1319210580,phyllanthus acidus,h m curran,brazil,phyllanthus distichus muellarg,h m curran,brazil,Phyllanthus distichus,0.8016918301582336,0.9149360709564807
+74,3092906623,sporobolus clandestinus,d eyles m eyles,united states of america,sporobolus clandestinus spreng hitche,d e and m s eyles,united states of america,Sporobolus clandestinus,0.8616836667060852,0.9795522313532565
+75,2235965636,pyrola asarifolia,w forwood,united states of america,p asarifolia michx,erich haber,canada,Pyrola asarifolia Michx.,0.6097506284713745,0.9067315822017904
+76,2625876902,yucca baccata,a nelson r a nelson,united states of america,yucca arizonica mckelvey,,united states of america,Yucca arizonica McKelvey,1.0,0.7593263966019993
+77,2625904355,juncus rugulosus,h p chandler,united states of america,juncus vurgulasus engelm,hp chandler,united states of america,Juncus rugulosus Engelm.,0.537455141544342,0.9117460107822462
+78,2235956175,vaccinium erythrocarpum,w cannon,united states of america,oxy coccus erythrocarpus mula pero,w a cannon,united states of america,Coccus,0.61305832862854,0.48524747941547286
+79,2512820352,boechera perennans,j w gillespie,united states of america,arabis gracilipas quins,j w gillespie,united states of america,Arabis gracilis,0.6778637766838074,0.39557740264262
+80,2235866543,labisia pumila,rahmat si boeea,indonesia,labisia pumila bl fvill,rahmat si boeea,indonesia,Labisia pumila,0.7085047364234924,0.9636241116594315
+81,2426902656,boerhavia torreyana,j n rose w r fitch,united states of america,boerhaavia torreyana standley,j n rose wm r fitch,united states of america,Boerhavia torreyana,0.8162525296211243,0.9756097560975611
+82,2549492260,saxifraga bronchialis,p hagelbarger,united states of america,saxi fraga bronchialis wild,p r hagelbarger,united states of america,Saxifraga bronchialis bronchialis,0.8344116806983948,0.9700534376881639
+83,1318212360,cyathea squamipes,k von sneidern,colombia,cyathea caracasana var boliviensis rosenst tryon,kjell von sneidern,colombia,Cyathea caracasana boliviensis,0.7842660546302795,0.8226366627527563
+84,2848506530,stipa przewalskyi,t tang,china,štika przewalski rasher,t tang,china,Przewalskia,0.6104220747947693,0.7765802747153208
+85,3005727173,panicum aciculare,r combs,united states of america,panicum fusiforme hitchc,731,united states of america,Panicum fusiforme Hitchc.,0.9151937961578369,0.7525489793561388
+86,1455174725,polystichum acrostichoides,j b mcconnell,canada,,j b mcconnell,canada,,0.0,0.0
+87,2284154890,nemophila pulchella,l constance h l mason,united states of america,nemophila pulchella eastwood,h l mason l constance,united states of america,Nemophila pulchella Eastw.,0.7907509803771973,0.9104369629528715
+88,2900461439,poa secunda,a eastwood,united states of america,poa secunda ssp secunda var scabrella thurb soreng,,united states of america,Scabrella,0.6855131983757019,0.46153846153846156
+89,2236031445,elliottia pyroliflora,e p walker e walker,united states of america,elliottia pyroliflora bongard brim,mr and mrs ernest p walker,united states of america,Elliottia pyroliflora,0.7632536292076111,0.980392156862745
+90,1843567618,solanum jamesii,p c standley,united states of america,solanum jamesii torr,paul c standley,united states of america,Solanum jamesii Torr.,0.8655515313148499,0.834847109936722
+91,2512804326,ranunculus fascicularis,l f ward,united states of america,ranunculus fascicularis,lester j hard,united states of america,Ranunculus fascicularis,0.9999999403953552,0.9834699358669275
+92,1322958063,varronia linnaei,e c leonard,haiti,cordia polycephala lam im johnst,e c leonard,haiti,Cordia polycephala,0.6600944995880127,0.6085806194501846
+93,2512759946,clematis ligusticifolia,f w johnson,united states of america,clematis ligusticifolia nuttall,frank w johnson,united states of america,Clematis ligusticifolia var. brevifolia Nuttall,0.8730682134628296,0.90597733148574
+94,2512761763,clematis occidentalis,anselme,canada,atragene americana sims,fr anselme,canada,Atragene americana,1.0,0.6269156688215357
+95,1852124166,solanum splendens,l kenover,panama,cyphomandra heterophylla d sm,lynn bohs,panama,Cyphomandra heterophylla,0.7708359360694885,0.5276448530110863
+96,2236057326,donnellsmithia juncea,m bourgeau,mexico,donnellsmithia pensedanoides kunth msc,m bourgeau,mexico,Donnellsmithia peucedanoides,0.6633538007736206,0.9230784000933474
+97,3043554903,hemarthria compressa,j nusker,india,hemarthria compressa lf r br,d masker,india,Hemarthria compressa,0.9211224317550659,0.9746794344808963
+98,1318373170,miconia elaeagnoides,e l ekman,haiti,tetrazygia elsae agnoides pv cc,,haiti,Tetrazygia eleagnoides,0.7978360056877136,0.8606978946185855
+99,3357284466,muhlenbergia richardsonis,c f baker,united states of america,muhlenbergia richardsonis trin rydb,c f baker,united states of america,Muhlenbergia richardsonis Rydb.,0.8293401598930359,0.9305102152735633
+100,1930449245,trichilia pallida,g v nash,haiti,prichilia fallida sev,george v nash,haiti,Trichilia pallida,0.7031567096710205,0.9756097560975611
+101,1320460457,phyllanthus amarus,e c leonard,haiti,phyllanthus swarzii kostel,e c leonard,haiti,Phyllanthus swartzii Kostel.,0.7439751625061035,0.7698003589195009
+102,1702851818,palicourea elata,p c standley r torres rojas,costa rica,cephaëlis elata sw,paul c standley rubén torres r,costa rica,Cephaelis elata,0.48195576667785645,0.8534918135881966
+103,1927799942,echinopepon insularis,e palmer,mexico,echinopepon insularis,dr edward palmer,mexico,Echinopepon insularis,1.0,0.9753048303966929
+104,1928276301,peperomia muscipara,e p killip,colombia,peperomia muscipara trel,ellsworth p killip,colombia,Peperomia muscipara,0.8970407247543335,0.9743975315293801
+105,1928034398,castilleja organorum,p c standley,united states of america,castilleia integra,p c s,united states of america,Castilleja integra integra,0.7843460440635681,0.8051091587951024
+106,1998729261,amelanchier laevis,f c seymour,united states of america,amelanchier laevis wiegand,f c seymour,united states of america,Amelanchier laevis Wiegand,1.0,0.9396159594539165
+107,1998968996,platanthera flava,c h knowlton,united states of america,habemaria flava 2 gray,clarence h knowlton,united states of america,Habenaria flava,0.7641785144805908,0.8711400676249502
+108,2425445076,calopogon tuberosus,a a eaton,united states of america,calopogon tuberosus var simpsonii small magrath,a a eaton,united states of america,Calopogon tuberosus simpsonii,0.9471838474273682,0.8783100656536798
+109,1998882868,carex laxiculmis,c wright,united states of america,carex laxiculmis schweins,robert f c naczi,united states of america,Carex laxiculmis Schwein.,0.90498286485672,0.8972229068318688
+110,1999105588,scheuchzeria palustris,c h bissell,united states of america,icheuchzena palustria,c h bissell,united states of america,Scheuchzeria palustris,0.7944985032081604,0.9762210399274298
+111,1998590947,solidago uliginosa,a s pease,united states of america,solidago neglecta t g,a s pease,united states of america,Solidago neglecta,0.9999998807907104,0.7492686492653552
+112,1998818359,amelanchier utahensis,c b wolf,united states of america,amelanchier utahensis koehne,carl b wolf,united states of america,Amelanchier utahensis Koehne,0.9999999403953552,0.9280036733712484
+113,1998308840,juncus articulatus,m a day,united states of america,juncus articulatus l,m a day,united states of america,Juncus articulatus,1.0,0.9736842105263159
+114,1998543749,acer negundo,e b chamberlain,united states of america,negundo aceroides moench,e b chamberlain,united states of america,Negundo aceroides,1.0,0.8660254037844386
+115,1999410499,stellaria crassifolia,j wolf j t rothrock,united states of america,s crascifolia sinh,e bourgeau,canada,Sinapis crassifolia,0.4564482569694519,0.8797861641347265
+116,2425404585,lysimachia maritima,e f williams m l fernald,canada,glaux maritima var obtusifolia fernald,e f williams m l fernald,canada,Glaux maritima var. obtusifolia Fern.,0.836283802986145,0.7753148860571065
+117,1999314245,carex aurea,c a weatherby,united states of america,carex aurea nutt,c a weatherby,united states of america,Carex aurea Nutt.,0.8479421734809875,0.835710894037345
+118,2573054178,quercus alba,w w eggleston,united states of america,quercus alba,willard w eggleston,united states of america,Quercus alba,1.0,0.9375
+119,2236018163,flacourtia rukam,h h bartlett,indonesia,flacourtia rakam,h h bartlett,indonesia,Flacourtia,0.7532743215560913,0.8492077756084468
+120,1318393575,ditaxis argothamnoides,f w pennell,colombia,argythamnia cochensis j r johnston,francis w pennell,colombia,Argythamnia cochensis,1.0,0.7789621985578676
+121,1676047656,passiflora holosericea,c conzatti,mexico,passiflora holosericea l,prof c conzalli,mexico,Passiflora holosericea,1.0,0.9791666666666667
+122,1317746297,croton californicus,i l wiggins,mexico,croton californicus tenuis swats feng,ira l wiggins,mexico,Croton californicus tenuis,0.7522563934326172,0.9226870278438684
+123,2426886454,amaranthus palmeri,b f bush,united states of america,amaranthus balmain s wat,j d sauer,united states of america,Amaranthus,0.5294904112815857,0.8416254115301731
+124,2452230713,silene cappadocica,e bourgeau,türkiye,silene cappadocia,c bourgeon,united states of america,,0.0,0.0
+125,1426052966,castilleja arachnoidea,pennell francis whittier,united states of america,castilleja eastwoodiana pennell,francis w pennell,united states of america,Castilleja eastwoodiana Pennell,1.0,0.8598492073268774
+126,2265509546,poa pratensis,congdon j,united states of america,poa pratensis,jw congdon,united states of america,Poa pratensis,0.9999999403953552,0.9459053029269171
+127,2265485412,blephilia hirsuta,lyon h,united states of america,blephilia hirsuta pursh torr,hs lyon,united states of america,Blephilia hirsuta,0.7299878001213074,0.9655172413793104
+128,2265588715,hieracium longipilum,rosendahl c,united states of america,hieracium longipilum torrey ex hooker,co rosendahl,united states of america,Hieracium longipilum Torr.,0.8582419753074646,0.9050971106611133
+129,2265566552,cyperus retrofractus,biltmore herb,united states of america,cyperus retrofractus torr,biltmore herbarium,united states of america,,0.0,0.0
+130,1930277693,phlegmariurus linifolius,f von egger,cuba,lycopodium linifolium,eggers,cuba,Lycopodium linifolium,1.0,0.7648808633001187
+131,1929084979,peperomia heterophylla,t g yuncker,honduras,peperomia heterophylla mia,t g yuncker r f dawson h r youse,honduras,Peperomia heterophylla,0.9580529928207397,0.9800177460682513
+132,1426166249,dalea polygonoides,wright charles,united states of america,dalea polygonoides ggray,pl wright,united states of america,Dalea polygonoides,0.7291814088821411,0.968245836551854
+133,2236950433,acer spicatum,j w adams,united states of america,acer spicatum jam,j w adams,united states of america,Acer spicatum,0.9999999403953552,0.9393364366277241
+134,1998843424,sorbus americana,m l fernald,united states of america,cyrus americana marsh dc,m l fernald,united states of america,Pyrus americana,0.7743860483169556,0.8996469021204838
+135,1998543588,pilea pumila,f blanchard,united states of america,pilea pumila l gray,g blanchard m o peacham,united states of america,Pilea pumila,0.8123594522476196,0.9486832980505139
+136,2425435971,agalinis maritima,w l c muenscher o f curtis,united states of america,agalinus tenuifolia vahl raf,w c muenscher and o f curtis jr,united states of america,Agalinis tenuifolia Vahl,0.8307726979255676,0.8036143754318459
+137,2446819762,dalea lanata,g w stevens,united states of america,parosela lavata spreng brit,g w stevens,united states of america,Parosela,0.5906257033348083,0.7050239879106325
+138,1999401722,carex gynandra,c h bissell,united states of america,crinata lapas var caux dannandra schreber schrodin torrey,c h bissell,united states of america,,0.0,0.0
+139,2575086103,melampyrum lineare,biltmore herbarium,united states of america,melampyrum latifolium muhl,biltmore herbarium,united states of america,Melampyrum,1.0,0.8660254037844387
+140,1998565761,erysimum perenne,w h brewer,united states of america,erysimum arkansamum,w h brewer,united states of america,Erysimum arkansanum,0.8540704846382141,0.5766967882001441
+141,1999130856,stellaria ruscifolia,f j hermann,united states of america,stellaria umbellata turcy,tj hermann,united states of america,Stellaria umbellata,0.7676823735237122,0.7877609890611015
+142,3341239321,solidago rugosa,unknown,united states of america,impeta polita,,united states of america,Polita,0.6643518209457397,0.5499719409228704
+143,3341248414,stellaria graminea,dana w fellows,united states of america,stellaria graminea l,d w fellows,united states of america,Stellaria graminea,1.0,0.9736842105263159
+144,3341257544,populus nigra,philip dowell william t davis,united states of america,populus migra l,philip dowell,united states of america,Populus nigra,0.7881665825843811,0.9393364366277241
+145,2235847388,styrax americanus,e j palmer,united states of america,styrax americana lam,e j palmer,united states of america,Styrax americanus,0.8106234669685364,0.9622504486493761
+146,1456376042,artemisia ludoviciana,j h sandberg d t macdougal a a heller,united states of america,artemisia graphalodes kutt,i h sandberg,united states of america,,0.0,0.0
+147,1563240212,inga feuillei,o f cook g b gilbert,peru,inga feuillei m,g b gilbert o f cook,peru,Inga feuilleei,0.8412593603134155,0.9363821838346236
+148,2592238071,lemna trisulca,e c leonard,united states of america,lemna trisulca l lemna minor l,e c leonard,united states of america,Lemna trisulca,0.8293567299842834,0.9428090415820635
+149,1322630109,tilia americana,e j palmer,united states of america,tilia ifprophylla vent small,e j palmer,united states of america,Tilia macrophylla,0.6792839765548706,0.7816317493509032
+150,2848440467,lolium perenne,w n suksdorf,united states of america,lolium perenne l,,united states of america,Lolium perenne,0.9999999403953552,0.9574271077563381
+151,1702827727,chiococca alba,c l lundell,mexico,chiococca alba l hitchc,,mexico,Chiococca alba Hitchc.,0.8181087374687195,0.9396159594539165
+152,2625873744,maianthemum stellatum,a nelson,united states of america,maianthemum stellatum l link,aven nelson,united states of america,Maianthemum stellatum,0.865428626537323,0.9810847227000674
+153,1318345053,carya glabra,l f ward,united states of america,carya glabra see c ovalis,lester f ward,united states of america,Carya,1.0,0.8153742483272113
+154,1321992876,frankenia palmeri,j n rose,mexico,francenia palmeri b wat,j n rose,mexico,Frankenia palmeri,0.557654082775116,0.9677419354838711
+155,2397724211,alnus acuminata,k fiebrig,bolivia plurinational state of,aluno forallensis var opachi rgl,k fiebrig,bolivia,Malthodes allenspachi,0.26128053665161133,0.6614769048245084
+156,1317726996,clermontia persicifolia,c n forbes,united states of america,clermontia persicifolia gand,c n forbes,united states of america,Clermontia persicifolia,0.8989991545677185,0.9784921095801633
+157,2425437928,cuscuta gronovii,a s goodale m hopkins,united states of america,cuscuta polygonorum engeland,a s goodale m hopkins,united states of america,Cuscuta polygonorum Engelm.,0.8282272815704346,0.6998542122237652
+158,1999110359,xyris difformis,j w robbins,united states of america,not provided,not provided,united states of america,,0.0,0.0
+159,1998611243,crataegus iracunda,h t brown,united states of america,crataquabrathinaim,henry j brown,united states of america,Crathis,0.41706812381744385,0.5669467095138409
+160,1999032409,scirpus pedicellatus,e brainerd,united states of america,sairfus pedicellatus var fullas fernand,ezra brainerd,united states of america,Monosporus pedicellatus var. pedicellatus,0.47225654125213623,0.8998425413316951
+161,3703056529,chenopodium album,m l fernald k m wiegand h t darlington,canada,chenopodium album x,m l fernald k m wiegand,canada,Chenopodium album,1.0,0.9565217391304349
+162,1998467065,chimaphila menziesii,mrs r m austin,united states of america,chimaphila menziesii,casick organ,united states of america,Chimaphila menziesii,1.0,0.9791666666666667
+163,1998686142,calamagrostis canadensis,j r churchill,united states of america,calamagrostis langsdorffii link,jr churchill,united states of america,Calamagrostis langsdorffii,1.0,0.8525735865191388
+164,1998387136,symphyotrichum pilosum,j murdoch jr,united states of america,aster ericoides l,will smith bettink,united states of america,Aster ericoides,1.0,0.44543540318737396
+165,1998808952,crataegus pruinosa,r e horsey,united states of america,brataegus trachypola sag,r elfforse,united states of america,Crataegus brachypoda,0.6529660224914551,0.8454027929649518
+166,2575039168,ulmus crassifolia,m e jones,united states of america,prunus glandulosa t g,marous e jenes,united states of america,Prunus glandulosa,0.8454617857933044,0.7419354838709679
+167,1999154508,crataegus uniflora,c s sargent,united states of america,crataegus uniflora muenchh,,united states of america,Crataegus uniflora,0.8025756478309631,0.9642857142857142
+168,1999217275,crepis acuminata,c w sharsmith,united states of america,crepis intermedia gray apm sierrae,c w sharsmith,united states of america,Crepis intermedia,1.0,0.704469953676347
+169,1998646322,heuchera pilosissima,f j youngberg,united states of america,heuchera pilosissima f m,f youngberg,united states of america,Heuchera pilosissima,1.0,0.9733285267845754
+170,1998370401,crataegus schuettei,w h blanchard,united states of america,crbyen edame,w h blanchard,united states of america,,0.0,0.0
+171,3392122454,persicaria maculosa,f vincent,canada,persicaria l,f vincent,canada,Persicaria,0.9999999403953552,0.8406728074767075
+172,3385632342,prosthechea ochracea,o nagel,mexico,prostechea ochracea lindl w e higgins,otto nagel,mexico,Prosthechea ochracea,0.7097002267837524,0.9791666666666667
+173,2012884607,halerpestes cymbalaria,c c plitt,united states of america,ranunculus pusillus poir,cc plia,united states of america,Ranunculus pusillus Poir.,0.8819283246994019,0.44582257006028225
+174,1998980522,crataegus pruinosa,c s sargent,united states of america,crajacques primera,r koch,united states of america,,0.0,0.0
+175,2426917844,dysphania ambrosioides,e p killip a c smith,colombia,chenopodium ambrosiodes l,,colombia,,0.0,0.0
+176,2284150251,polygonum douglasii,p a rydberg a o garrett,united states of america,polygonum montanum greene,p a rydberg a o garrett,united states of america,Polygonum montanum,0.8479270935058594,0.7342172431423766
+177,1852143901,sanchezia parvibracteata,j cuatrecasas,colombia,sanchezia parvibracteata sprague,j cuatrecasas,colombia,Sanchezia parvibracteata Sprague & Hutch.,0.8837081789970398,0.9024621789837645
+178,1456250583,helenium puberulum,l ward,united states of america,helenium decurrens less vatke,i f ward,united states of america,,0.0,0.0
+179,1318334082,pilea inaequalis,w l abbott,dominican republic,pilea repens sw wedd,w l abbott,dominican republic,Pilea repens,0.6956129670143127,0.75
+180,2549483800,ribes hudsonianum,a muller,canada,rules hudsonianum rebord,adolf müller,canada,,0.0,0.0
+181,2595756978,cyperus odoratus,c f parker,united states of america,cyperus odoratus l,c f parker,united states of america,Cyperus odoratus,0.9999998807907104,0.9583333333333335
+182,1990824315,lygodesmia juncea,cs crandall,united states of america,lygodesmia juncea pursh don ex hook,prof c s crandall,united states of america,Lygodesmia juncea,0.8408058285713196,0.9523809523809526
+183,2236597761,centaurium pulchellum,e b bartsam,united states of america,centaurium pulchellum sur,e b bartram,united states of america,Centaurium pulchellum,1.0,0.9775252199076786
+184,1990809725,gentiana saponaria,walter m rankin,united states of america,gentiana saponaria l,myhough,united states of america,Gentiana saponaria,1.0,0.9782608695652174
+185,1929709919,herbertus serratus,h h rusby,bolivia plurinational state of,schisma serratum,hh rusby md,bolivia,Schisma serratum,1.0,0.7524115817860576
+186,1319326174,asclepias subverticillata,e a mearns,united states of america,asclepias subverticillata a gray vail,edgar a mearns,united states of america,Asclepias subverticillata Vail,0.8725256323814392,0.973417168333576
+187,2452332496,anredera vesicaria,g p goll o f cook g n collins,puerto rico,anredera vesicaria lam gaertner,p acevedordgz,united states of america,Anredera vesicaria,0.7500828504562378,0.978231976089037
+188,1317840733,elymus hystrix,a s hitchcock a chase,united states of america,elymus hystrix l,unknown,united states of america,Elymus hystrix,1.0,0.9444444444444446
+189,2235846679,ocotea brachybotra,m barreto,brazil,ocater brachybatra meg,mello barreto,brazil,,0.0,0.0
+190,1320104750,sida jussiaeana,p c standley,panama,sida decumbens sthil naud,paul c standley,panama,Sida decumbens,0.713459849357605,0.6374552583116766
+191,2236156450,comarostaphylis diversifolia,c b wolf,united states of america,comarostaphylis diversifolia parry greene subsp planifolia jeps g wallace,carl b wolf,united states of america,Comarostaphylis diversifolia Greene,0.912467360496521,0.9122614681554039
+192,2236024256,chaetogastra mollis,e asplund,ecuador,tibouchina mollis bonpl cogn,erik asplund,ecuador,Tibouchina mollis Cogn.,0.6603546738624573,0.6524726973924442
+193,2236116492,calystegia sepium,e o wooton,united states of america,convolvulus sepium l,,united states of america,Convolvulus sepium,1.0,0.565685424949238
+194,437160969,myrtopsis pomaderridifolia,schlechter frr,new caledonia,myrtopsis pomaderridifolia baill guillaumin,r schlechter,france,Myrtopsis pomaderridifolia (Baill.) Guillaumin,0.9107810258865356,0.8701765379549182
+195,2848503382,vulpia myuros,g r vasey,united states of america,festuca myuros linn,,united states of america,Festuca myuros,0.8947186470031738,0.7302967433402215
+196,2452293722,ranunculus hispidus,h d house,united states of america,ranunculus hispidus michaux var hispidus,h d house,united states of america,Ranunculus hispidus hispidus,0.8783635497093201,0.95480527676127
+197,1322253698,embelia pacifica,j f rock,united states of america,embelia pacifica hillebrand,j f rock,united states of america,Embelia pacifica Hillebr.,0.8719266653060913,0.9131711662921516
+198,1929883118,symphyotrichum ontarionis,h k d eggert,united states of america,aster missouriensis britten,h eggert,united states of america,Aster missouriensis Britton,0.8587844371795654,0.771964708419319
+199,1928246346,brachyotum nutans,f w pennell,peru,brachyotum nutans gleason,francis w pennell,peru,Brachyotum nutans Gleason,1.0,0.9043629472869463
+200,2397721138,quercus stellata,h oneill,united states of america,quercus stellata wang,hugh oneill,united states of america,Quercus stellata,0.8449321389198303,0.9642857142857142
+201,3005768358,panicum stapfianum,collector unknown,united states of america,panicum minus staff,mc neill mies,africa,,0.0,0.0
+202,2643351883,aphyllon uniflorum,e a mcgregor,united states of america,phyllos miflorum,e a mcgregor,united states of america,Calophyllum ramiflorum,0.6087144017219543,0.8696263565463044
+203,2452246731,claytonia rubra,a a heller,united states of america,claytonia rubra ssp rubra howell tidestrom,a a heller,united states of america,Claytonia rubra rubra,0.8291801810264587,0.934331459958054
+204,3005774322,panicum commutatum,a chase,united states of america,panicum ashei pearson,agnes chase,united states of america,Panicum ashei G.Pearson,0.8345205783843994,0.5301589619830679
+205,2284285249,stigmaphyllon bannisterioides,e c leonard g m leonard,haiti,stigmaphyllon bannisterioides l c anderson,emery c leonard genevieve m leonard,haiti,Stigmaphyllon bannisterioides,1.0,0.9841356626102458
+206,2512764119,cochlearia anglica,e mouillefarine,spain,cichliaria anglica q,e mouillefarine,spain,Cicada anglica,0.6377000212669373,0.8948929172439198
+207,2284359495,phyla cuneifolia,w over,united states of america,phyla cuneifolia torr greene,w h over,united states of america,Phyla cuneifolia Greene,0.7724546194076538,0.8263342440128467
+208,3356836015,vicia americana,e b payson l e b payson,united states of america,vicia americana muhl ex willd var americana,edwin b payson lois b payson,united states of america,Vicia americana americana,0.765712559223175,0.9711631658670145
+209,1998387245,trichostema brachiatum,a a heller,united states of america,trichostema brachiatum l,a a heller,united states of america,Trichostema brachiatum,1.0,0.9772991539767322
+210,1998956483,arctostaphylos uvaursi,j l c m victorin,canada,arctostaphylos uvaursi l spreng,frm victorin,canada,Arctostaphylos uva-ursi (L.) Spreng.,0.660009503364563,0.8357108940373449
+211,1998394637,carex scabriuscula,j t howell,united states of america,carex gigas holm mackenzie,john thomas howell,united states of america,Carex fracta Mackenzie,0.662937581539154,0.712636121415336
+212,1999253468,viola odorata,e b harger,united states of america,viola odorata l,e b harger,united states of america,Viola odorata,0.811907947063446,0.96
+213,3734813368,juncus dudleyi,n c fassett,united states of america,juncus dudleyi wieg,n c fassent,united states of america,Juncus dudleyi Wiegand,0.8469219207763672,0.8882347881956881
+214,3111515383,alloberberis trifoliolata,c s sargent,united states of america,berberis trifoliata,c s sargent,united states of america,Berberis trifoliata,1.0,0.8837194490007302
+215,2234232983,macropiper puberulum,wilkes expedition,samoa,piper macropiper latifolium,u s south pacific exploring expedition,samoa,Macropiper latifolium,0.9311297535896301,0.7610194341477616
+216,1999262308,paronychia sessiliflora,w m canby,united states of america,paronychia resiliflora mit,e bourgean,united states of america,Paronychia sessiliflora,0.8164463639259338,0.9787234042553192
+217,2900455868,cinna arundinacea,g b sudworth,united states of america,cinna arundinacea l,gb sudworth,united states of america,Cinna arundinacea,1.0,0.9775252199076786
+218,1563299852,inga nobilis,b a krukoff,brazil,inga nobilis willd,b a krukoff,brazil,Inga nobilis Willd.,0.859817385673523,0.8693182879212226
+219,2512801142,rorippa palustris,f clements,united states of america,nasturtium palustre l dc,fred clements,united states of america,Nasturtium palustre,1.0,0.7502895194085833
+220,3005755518,paspalum decumbens,h f pittier,guatemala,paspalum decumbens swartz,h pittier,guatemala,Paspalum decumbens,0.7842979431152344,0.9660917830792958
+221,1455960532,tapirira obtusa,f c hoehne,brazil,tapirira marchandii engl,f c hoehne,brasil,Tapirira marchandii Engl.,0.8488303422927856,0.7265911604027345
+222,1929881359,acer saccharum,m e wharton,united states of america,acer saccharum marsh,mary e wharton,united states of america,Acer saccharum,1.0,0.9643959372630745
+223,1928479020,cirsium mexicanum,j g jack,cuba,cirsium mexicanum dc,j g jack,cuba,Cirsium mexicanum,1.0,0.9692233691951199
diff --git a/nbs/data/AzureVisionResults/scores.png b/nbs/data/AzureVisionResults/scores.png
new file mode 100644
index 0000000..f2d1a1e
Binary files /dev/null and b/nbs/data/AzureVisionResults/scores.png differ
diff --git a/nbs/data/ChineseImages/1.jpg b/nbs/data/ChineseImages/1.jpg
new file mode 100644
index 0000000..4a0dca9
Binary files /dev/null and b/nbs/data/ChineseImages/1.jpg differ
diff --git a/nbs/data/ChineseImages/10.jpg b/nbs/data/ChineseImages/10.jpg
new file mode 100644
index 0000000..0b57710
Binary files /dev/null and b/nbs/data/ChineseImages/10.jpg differ
diff --git a/nbs/data/ChineseImages/2.jpg b/nbs/data/ChineseImages/2.jpg
new file mode 100644
index 0000000..4a0dca9
Binary files /dev/null and b/nbs/data/ChineseImages/2.jpg differ
diff --git a/nbs/data/ChineseImages/3.jpg b/nbs/data/ChineseImages/3.jpg
new file mode 100644
index 0000000..cde17f6
Binary files /dev/null and b/nbs/data/ChineseImages/3.jpg differ
diff --git a/nbs/data/ChineseImages/4.jpg b/nbs/data/ChineseImages/4.jpg
new file mode 100644
index 0000000..e7c69b9
Binary files /dev/null and b/nbs/data/ChineseImages/4.jpg differ
diff --git a/nbs/data/ChineseImages/5.jpg b/nbs/data/ChineseImages/5.jpg
new file mode 100644
index 0000000..6da7805
Binary files /dev/null and b/nbs/data/ChineseImages/5.jpg differ
diff --git a/nbs/data/ChineseImages/6.jpg b/nbs/data/ChineseImages/6.jpg
new file mode 100644
index 0000000..e53b30d
Binary files /dev/null and b/nbs/data/ChineseImages/6.jpg differ
diff --git a/nbs/data/ChineseImages/7.jpg b/nbs/data/ChineseImages/7.jpg
new file mode 100644
index 0000000..51ec3ba
Binary files /dev/null and b/nbs/data/ChineseImages/7.jpg differ
diff --git a/nbs/data/ChineseImages/8.jpg b/nbs/data/ChineseImages/8.jpg
new file mode 100644
index 0000000..b7e1dcf
Binary files /dev/null and b/nbs/data/ChineseImages/8.jpg differ
diff --git a/nbs/data/ChineseImages/9.jpg b/nbs/data/ChineseImages/9.jpg
new file mode 100644
index 0000000..e52bf03
Binary files /dev/null and b/nbs/data/ChineseImages/9.jpg differ
diff --git a/nbs/data/ChineseImages/chineseHerbarium.csv b/nbs/data/ChineseImages/chineseHerbarium.csv
new file mode 100644
index 0000000..4f08990
--- /dev/null
+++ b/nbs/data/ChineseImages/chineseHerbarium.csv
@@ -0,0 +1,38 @@
+ReferenceID,Scientific Name,Chinese Name,Identified By,Identification Date,Collector,Collector's No,Collection Date,Locality,Elevation,Habitat,Reproductive Condition
+1,Callicarpa giraldii Hesse ex Rehder,老鸦糊,林祁,20170702," 陈耀东, 马欣堂, 杜玉芬, 班勤",5008,19900707,中国 安徽省,370,山坡林中,有花有果
+2,Arisaema erubescens Schott,一把伞南星,马政旭,,,467,19710728,中国 河北省,,山沟林下,有花有果
+3,Arisaema heterophyllum Blume,天南星,林祁,20170702,,20086,19900512,中国 贵州省,790,杂木林下,有花有果
+4,Saurauia tristyla DC.,水东哥,林祁,20170702,云南二队,151,19670421,中国 云南省,800,山坡丛林,有花有果
+5,Juncus Linn, 灯心草属,林祁,20170702,榆东组,5,19680714,中国 陕西省,,,有花无果
+6,Rhus chinensis Mill.,盐肤木,林祁,20170702,李安仁,7848,19781023,中国 云南省,2000,山坡,有花有果
+7,Toxicodendron succedaneum Kuntze,野漆,林祁,20170702,付德志等,87-0677,19870513,中国 四川省,900, 沟边林内,有花无果
+8,Rhus punjabensis J.L.Stew. ex Brand. var. sinica (Diels) Rehd.,红麸杨,林祁,20170702,北京队,1169,19880606,中国 湖南省,600,山坡林内,有花有果
+9,Mallotus repandus Muell.Arg.,石岩枫,,,He Qi-guo,69-1,19790706,中国 湖北省,340,,有花有果
+10,Pistacia chinensis Bunge,黄连木,林祁,20170702,叶国栋,690,19770905,中国 福建省,30,栽培,有花有果
+11,Toxicodendron grandiflorum C.Y.Wu & T.L.Ming,大花漆,林祁,20170702,滇东北队,1190,19721001,中国 云南省,1890,山坡路边栽培,有花有果
+12,Iris halophila Pall.,喜盐鸢尾,林祁,20170702,"陈学林, 王一峰",910545,19910710,中国 甘肃省,2700,高山杂类草原,有花有果
+13,Skimmia reevesiana R.Fortune,茵芋,林祁,20170702,调查队,660476,19660423,中国 江西省,1600,山顶沟谷矮林中,有花无果
+14,Phellodendron amurense Rupr.,黄檗,林祁,20170702,金佛山考察队,385,19860528,中国 重庆市,650, 栽培,无花有果
+15,Corydalis delavayi Franch.,苍山黄堇,郝加琛,20181118,青藏队,12790,19830809,中国 四川省,3800~3900,山坡灌丛草地,有花无果
+16,Peganum nigellastrum Bunge,骆驼蒿,Mingli Zhang,20151201,,27,19630530,中国 内蒙古自治区,,,有花无果
+17,Skimmia reevesiana R.Fortune,茵芋,林祁,20170702,蓝顺彬,312,19841112,中国 云南省,2800,杂木林,有花无果
+18,Zanthoxylum armatum Druce,竹叶花椒,林祁,20170702,毛少华等,246,19620701,中国 江苏省,,庙内灌丛中,有花有果
+19,Skimmia reevesiana R.Fortune,茵芋,林祁,20170702,黄志,31123,19610910,中国 广东省,,山谷中溪边,无花有果
+20,Ptelea trifoliata Linn.,,林祁,20170702,"王文采,曹子余",150,19701100,中国 北京市,,,无花有果
+21,Halenia elliptica D.Don,椭圆叶花锚,林祁,20170702,"李全喜,赵兴存",2833,19690827,中国 四川省, 2300,林中灌丛,无花有果
+22,Murdannia Royle,水竹叶属,,,农田杂草调查组,108,19700522,中国 海南省,300,甘蔗地,有花无果
+23,Anaphalis margaritacea Benth. & Hook.f.,杠头香青, 张国进,20180401,PE西藏队,PE5838,20170903,中国 西藏自治区,3401,山坡,有花无果
+24,Microtoena moupinensis (Franch.) Prain,宝兴冠唇花,Qing Wang,20140512,赵佐成,4736,19931026,中国 四川省,2100,阳坡沟边,无花有果
+25,Colocasia gigantea Hook.f.,大野芋,刘正宇,19950921,,14771,19940904,中国 重庆市,640,溪边,有花无果
+26,Corydalis calcicola W.W.Smith, 灰岩紫堇,Magnus Liden,20181101,青藏队,3011,19810714,中国 云南省,,山坡流石滩,有花无果
+27,"Corydalis kiukiangensis C. Y. Wu, Z. Y. Su et Liden",俅江紫堇,Magnus Liden,20181101,青藏队,9832,19820903,中国 云南省,,路边草地,有花无果
+28,Corydalis balsamiflora Prain,,Magnus Liden,20170000,"张永田,郎楷永",4,19650626,中国 四川省 ,2750,山坡水边,有花无果
+29,,川百合,高云东,20190115,236-6队,2721,19760712,中国 湖北省,1600,林下,有花无果
+30,Atalantia acuminata C.C.Huang, 尖叶酒饼,林祁,20170702,王洪,1645,19920320,中国 云南省,1300,干燥石灰山,有花有果
+31,Tongoloa looloensis H.Wolff,云南东俄芹,,,CLD-90,1139,19901011,中国 云南省,3200,,有花无果
+32,Triglochin maritimum Linn.,海箭草,王忠涛, 20140403,丘明新,94,19640716,中国 宁夏回族自治区 ,,咸水潮湿地,盐土沙地,有花无果
+33,Triglochin maritimum Linn.,海韭菜,王忠涛,20140403,"赵献锳, 杨汝荣",锡-063,19800706,中国 内蒙古自治区,, 草甸, 有花无果
+34,Monochoria korsakowii Regel & Maack,雨久花,王忠涛,20170311,"吉占和, 宋书银, 王忠涛",88,19850703,中国 云南省,850,水沟边,有花无果
+35,Nitraria tangutorum Bobrov,Mingli Zhang,,20151202,M94,,19600629,中国 新疆维吾尔自治区,,,无花无果
+36,Zanthoxylum acanthopodium DC. var. timbor Hook.,毛刺花椒, 林祁,20170702,绿春队,796,19740513,中国 云南省,1700,公路旁次生灌丛,有花有
+37,Zanthoxylum armatum Druce,竹叶花椒,林祁,20170702,华南分队,2303,19891008,中国 广西壮族自治区,450,山地山谷,有花有果
diff --git a/nbs/data/ChineseResults/1.txt b/nbs/data/ChineseResults/1.txt
new file mode 100644
index 0000000..87025d9
--- /dev/null
+++ b/nbs/data/ChineseResults/1.txt
@@ -0,0 +1,32 @@
+To provide the relevant information in a JSON format according to the Darwin Core standard, I'll extract details from the text provided. Although the text provided is quite limited and partially unclear, I will put together the information as best as I can:
+
+- `Nº 1741933` is likely the specimen or catalog number.
+- `Clerodendrum japonicum (Thunb.) Sweet` appears to be what is indicated by "Caliarpa girakai," which seems like a misspelling or an old synonym for the species. As it was not possible to confirm 'Caliarpa girakai' as a recognized species name in any biodiversity database, we will use the legitimate species name for Clerodendrum japonicum.
+- `Have is Ribe.` is not clear, but it possibly refers to a location or collector, which unfortunately cannot be interpreted without further context.
+- `4 5.` is ambiguous and does not relate to any standard Darwin Core field without additional context.
+- The date `1817-7-3` represents the event date on which the specimen was collected or observed.
+
+Since we do not have explicit information about the country, based on the genus `Clerodendrum` which is native to many regions including Asia, and because there was Chinese text mentioned (though not provided), we might infer the country to be China for this entry.
+
+Here is the information in JSON format. Note that some fields remain empty ("") because the provided text does not contain this information, and I do not want to assume data which could be inaccurate.
+
+```json
+{
+ "occurrenceID": "1741933",
+ "scientificName": "Clerodendrum japonicum (Thunb.) Sweet",
+ "eventDate": "1817-07-03",
+ "country": "China",
+ "recordedBy": "",
+ "locality": "",
+ "decimalLatitude": "",
+ "decimalLongitude": "",
+ "coordinateUncertaintyInMeters": "",
+ "identificationQualifier": "",
+ "taxonRank": "",
+ "institutionCode": "",
+ "collectionCode": "",
+ "basisOfRecord": ""
+}
+```
+
+Without further information, we cannot accurately fill out the remaining Darwin Core fields. Additional context would allow for a more complete and accurate JSON representation.
\ No newline at end of file
diff --git a/nbs/data/ChineseResults/10.txt b/nbs/data/ChineseResults/10.txt
new file mode 100644
index 0000000..155d904
--- /dev/null
+++ b/nbs/data/ChineseResults/10.txt
@@ -0,0 +1,16 @@
+Based on the information provided, it seems that there might be an attempt to format data according to the Darwin Core standard, which is a framework for sharing biodiversity information. However, the information given is quite limited and lacks context. From what is provided, I will extract the bits of information that could potentially fit into Darwin Core fields. Given the term "FUICIEN" which seems to be a misspelling, I am assuming it might be "FUJIAN" — which is a province in China. Therefore, the country can be inferred as "China".
+
+Here's a JSON representation of the fragment of information based on the Darwin Core standard:
+
+```json
+{
+ "catalogNumber": "1364939",
+ "recordNumber": "1817-7-2",
+ "country": "China",
+ "fieldNumber": "174"
+}
+```
+
+Important details related to common Darwin Core fields such as `scientificName`, `decimalLatitude`, `decimalLongitude`, `eventDate`, `family`, `genus`, etc. are not present in the text snippet you have provided. If these details are available, they could be added to the JSON object to conform more completely to the Darwin Core standard.
+
+In this scenario, `catalogNumber` is used as it typically refers to the unique identifier for the specimen in a collection, `recordNumber` might represent the field record number associated with the collection event, and `fieldNumber` is assumed to be an additional identifier used in the field. However, these are educated guesses based on usual museum and herbarium practices and the limited context provided.
\ No newline at end of file
diff --git a/nbs/data/ChineseResults/2.txt b/nbs/data/ChineseResults/2.txt
new file mode 100644
index 0000000..4cc035f
--- /dev/null
+++ b/nbs/data/ChineseResults/2.txt
@@ -0,0 +1,22 @@
+The information provided is somewhat limited, but I will extract the available details and structure them according to the Darwin Core (DwC) standard to the best of my ability. The Darwin Core standard is used to share information about biological diversity by providing a stable framework of terms to facilitate the sharing of information about specimens.
+
+Here's an attempt at the JSON output with the given information; please note some fields like `country` are inferred and others like `scientificName` are derived from the text:
+
+```json
+{
+ "occurrenceID": "1741933",
+ "scientificName": "Caliarpa girakai",
+ "eventDate": "1817-07-03",
+ "country": "unknown",
+ "recordedBy": "Ribe."
+}
+```
+
+Explanation of fields:
+- `occurrenceID`: A unique identifier for the record (specimen). It seems to correspond to the number given at the start of the text.
+- `scientificName`: The scientific name of the organism. It appears to be referring to a species name in the text, though "Caliarpa girakai" does not correspond to any known species. There could be errors in the transcription, or it might be an obsolete or misspelled scientific name.
+- `eventDate`: The date of the event associated with the specimen. It is given in the format YYYY-MM-DD.
+- `country`: The country where the specimen was observed or collected. The country is not mentioned in the text, hence "unknown." If the text is partly Chinese, one might infer that the specimen could be linked to a Chinese-speaking region, but without concrete evidence, it remains speculative.
+- `recordedBy`: The person who recorded the specimen or observation. Presumably, "Ribe." is an abbreviation or surname of the recorder.
+
+Please note that the standard fields such as `locality`, `decimalLatitude`, `decimalLongitude`, `institutionCode`, `basisOfRecord`, and others are missing due to the lack of information in the provided text. If more details were available, they should be included in the JSON output as per the Darwin Core standard.
\ No newline at end of file
diff --git a/nbs/data/ChineseResults/3.txt b/nbs/data/ChineseResults/3.txt
new file mode 100644
index 0000000..58ae0be
--- /dev/null
+++ b/nbs/data/ChineseResults/3.txt
@@ -0,0 +1,20 @@
+The text you've provided includes some specimen information that may correspond to the Darwin Core standard, which is a framework designed to facilitate the sharing of information about biological diversity. However, the text is somewhat fragmented and unclear. From what can be provided, here is a JSON representation of the relevant information that could be inferred:
+
+```json
+{
+ "institutionCode": "ROPEI",
+ "collectionCode": "植物标本室",
+ "catalogNumber": "1275485",
+ "recordNumber": "44",
+ "fieldNumber": "467"
+}
+```
+
+Please note that I've made several assumptions here:
+
+- "institutionCode" is usually the code for the institution where the specimen is held. I have assumed "ROPEI" represents the institution code.
+- "collectionCode" possibly represents the specific collection within the institution and I've taken "植物标本室" which translates to "Plant Specimen Room" in English, as that code.
+- "catalogNumber" is often a unique identifier for the specimen within the collection, in this case represented by "No 1275485".
+- "recordNumber" and "fieldNumber" are used here but it's unclear what "44" and "467" represent as the text provided does not clarify their meaning within the context.
+
+The other numbers and information are difficult to place within Darwin Core elements without additional context. If you know where each piece of data belongs in the Darwin Core standard, or if you can provide additional information to properly map the data, I can update the JSON accordingly.
\ No newline at end of file
diff --git a/nbs/data/ChineseResults/4.txt b/nbs/data/ChineseResults/4.txt
new file mode 100644
index 0000000..031521f
--- /dev/null
+++ b/nbs/data/ChineseResults/4.txt
@@ -0,0 +1,29 @@
+Based on the information provided and using the Darwin Core (DwC) standard, which is a framework intended to provide a stable, robust, and flexible reference to share information about biological diversity, here's how the information could be structured in JSON format:
+
+```json
+{
+ "institutionCode": "中国科学校区用植物研究所",
+ "collectionCode": "蝶态室",
+ "catalogNumber": "0973767",
+ "scientificName": "Saarmia oistyla",
+ "recordNumber": "DK- 4",
+ "eventDate": "2073-7-1",
+ "country": "China",
+ "basisOfRecord": "PreservedSpecimen",
+ "locality": "北国"
+}
+```
+
+Explanation of fields:
+
+- `institutionCode`: The code or name of the institution where the record is held, which appears to be "中国科学校区用植物研究所" (likely "Institute of Botany, Chinese Academy of Sciences" or a similar institution).
+- `collectionCode`: The name or code denoting the collection or dataset within the institution, which is identified as "蝶态室" (butterfly room or section within the institution).
+- `catalogNumber`: A unique identifier for the record within the collection, which is "0973767".
+- `scientificName`: The scientific name of the organism in the standard format of genus followed by species, which in this case is "Saarmia oistyla".
+- `recordNumber`: A unique identifier assigned by the collector(s) or observer(s), which seems to be represented by "DK- 4".
+- `eventDate`: The date the specimen was collected or observed, which is mentioned as "2073-7-1". However, it should be noted that this future date seems incorrect or typographical error and may need to be verified.
+- `country`: The country where the specimen was collected. As the text includes Chinese characters and the institution appears to be in China, it is inferred that the country is "China".
+- `basisOfRecord`: The specific nature of the data record - a preserved specimen in this case.
+- `locality`: The specific place where the organism was found or collected, referred to as "北国" which translates to "northern country", yet this is rather vague and it's unclear exactly which locality this refers to within China.
+
+Note: The fields are based on the Darwin Core terms and some assumptions and interpretations of the text provided. Parts of the text are unclear or incomplete; for example, "15元" might refer to a price, fee, or something else entirely but does not fit within the standard Darwin Core fields. Additionally, the date "2073-7-1" appears to be an error, as it's a date in the future from the current knowledge cutoff.
\ No newline at end of file
diff --git a/nbs/data/ChineseResults/5.txt b/nbs/data/ChineseResults/5.txt
new file mode 100644
index 0000000..f148d62
--- /dev/null
+++ b/nbs/data/ChineseResults/5.txt
@@ -0,0 +1,3 @@
+As an AI language model, I can process and extract information provided I have the text to work with. Since you haven't provided the actual text containing information about a biodiversity specimen that I am supposed to process, I can't generate a Darwin Core JSON object with the relevant information.
+
+Could you please provide the text containing the details of the biodiversity specimen? With the actual specimen information, I can then help you by returning the relevant details organized according to the Darwin Core standard in JSON format.
\ No newline at end of file
diff --git a/nbs/data/ChineseResults/6.txt b/nbs/data/ChineseResults/6.txt
new file mode 100644
index 0000000..c119b84
--- /dev/null
+++ b/nbs/data/ChineseResults/6.txt
@@ -0,0 +1,18 @@
+Based on the provided text, it appears to be a record of a biological specimen but the information is incomplete and somewhat fragmented. In Darwin Core, we'd need to standardize this information into a format recognizable by biodiversity databases. Below is the information in JSON format to the best interpretation of the given text:
+
+```json
+{
+ "institutionCode": "中国药学院就和研究所",
+ "collectionCode": "植物吉本室 標本室",
+ "catalogNumber": "1213821",
+ "scientificName": "Anacardiairar",
+ "recordedBy": "M",
+ "recordNumber": "45",
+ "eventDate": "1817-07-02",
+ "country": "中国"
+}
+```
+
+Please note that I interpreted "中国药学院就和研究所 植物吉本室" as a combination of the institution code and collection code. "標本室" (specimen room) may refer to the physical location where the specimen is stored within the institution. "M" is recorded in the `recordedBy` field, possibly referring to the person or team identifier who collected the specimen, although traditionally a full name is provided. The date "1817-7-2" is formatted to ISO 8601 standard as "1817-07-02". Finally, the country is inferred as "中国" (China) from the text, and I converted "Anacardiairar" directly as it appears to be a misspelling or incomplete scientific name – possibly referring to a genus in the Anacardiaceae family.
+
+Please note that the quality of this JSON representation is entirely dependent on the accuracy and completeness of the input provided. In a real-world context, the verbatim scientific name should be verified and possibly corrected to a valid taxonomic name. Similarly, additional fields would be filled in given a more complete record, including specific locality, coordinates, and other relevant Darwin Core terms.
\ No newline at end of file
diff --git a/nbs/data/ChineseResults/7.txt b/nbs/data/ChineseResults/7.txt
new file mode 100644
index 0000000..f97fb1c
--- /dev/null
+++ b/nbs/data/ChineseResults/7.txt
@@ -0,0 +1,13 @@
+Based on the text provided, there is not sufficient information to fill out a Darwin Core record. The Darwin Core standard requires specific information about biodiversity specimens, such as the scientific name, the collector's name, the date of collection, and the location of collection, among other details.
+
+From the text given, we can infer an identifier ("Nº 1373146"), which could be considered a catalogue number or an occurrence ID. However, without additional context, it is not possible to infer the country or other relevant Darwin Core fields from the number alone.
+
+Here is a JSON representation of the limited information available, using an assumed field "catalogNumber" for the identifier:
+
+```json
+{
+ "catalogNumber": "1373146"
+}
+```
+
+Should you be able to provide additional context or details from the specimen record, I could then translate that into a more complete Darwin Core JSON structure.
\ No newline at end of file
diff --git a/nbs/data/ChineseResults/8.txt b/nbs/data/ChineseResults/8.txt
new file mode 100644
index 0000000..bc6bfe5
--- /dev/null
+++ b/nbs/data/ChineseResults/8.txt
@@ -0,0 +1,27 @@
+Without more context or a clear structure in the text, it is difficult to extract meaningful Darwin Core field information. The Darwin Core standard includes fields like scientificName, country, collectionCode, catalogNumber, etc., to share information about a biodiversity specimen. The text snippet provided appears to be fragmented and partially nonsensical, which makes it challenging to map the content accurately to Darwin Core fields.
+
+However, I'll make an attempt to parse the given text to fit Darwin Core to some extent:
+
+- "植物" translates to "plant," which might indicate that we are dealing with a plant specimen.
+- "帮茶室" could be related to a tea room or a collective involved with plants, but this is not relevant to a specimen record.
+- The "Anacardirence" might be a misspelling or an incorrect transcription; we could interpret it as an attempt to indicate the family Anacardiaceae.
+- "N! 1399359" seems to be a catalog or accession number.
+- "Kazı punjabentis karwert vas" looks like a garbled string with partial resemblance to scientific nomenclature or location descriptions. "Kazı" could potentially refer to a place or an excavation term; "Punjab" is a region in India and Pakistan; however, the rest of the string doesn't provide any coherent information and might be a transcription error.
+
+Assuming "Punjab" is the clearest geographic reference, we might infer that the country is either India or Pakistan. Nevertheless, due to the uncertainty and lack of clear data, we should be cautious with this assumption.
+
+Here is our best attempt at a JSON representation in Darwin Core format with the very limited information provided:
+
+```json
+{
+ "scientificName": "Unknown",
+ "family": "Possible Anacardiaceae",
+ "catalogNumber": "1399359",
+ "country": "India/Pakistan (inferred, not confirmed)",
+ "eventRemarks": "Text information is fragmented and not clearly related to any specific specimen. 'Kazı punjabentis karwert vas' not identifiable.",
+ "recordedBy": "Unclear, text does not specify",
+ "collectionCode": "Unclear"
+}
+```
+
+Given the fragmentary and possibly misinterpreted information in the text, this JSON structure uses placeholders where data cannot be accurately determined and includes inferred information with a disclaimer.
\ No newline at end of file
diff --git a/nbs/data/ChineseResults/9.txt b/nbs/data/ChineseResults/9.txt
new file mode 100644
index 0000000..ed29913
--- /dev/null
+++ b/nbs/data/ChineseResults/9.txt
@@ -0,0 +1,29 @@
+Based on the provided text, I will extract relevant information and present it in JSON format according to the Darwin Core standard. Unfortunately, the provided text contains fragments that are difficult to interpret accurately without additional context, but I will do my best to infer the information:
+
+```json
+{
+ "institutionCode": "中国科学院植物研究所",
+ "collectionCode": "植物标本室",
+ "year": "1979",
+ "month": "7",
+ "day": "6",
+ "catalogNumber": "49-1",
+ "stateProvince": "Hupei",
+ "locality": "Lustlan",
+ "scientificName": "Maillotus nepalensis (Willd.) Muell. - Arg.",
+ "country": "China"
+}
+```
+
+Here's the breakdown of the Darwin Core fields I've used:
+
+- **institutionCode:** The institution where the specimen is kept. Here it's assumed to be the Institute of Botany, Chinese Academy of Sciences.
+- **collectionCode:** The specific collection within the institution, presumably the Plant Specimen Room.
+- **year/month/day:** The collection date of the specimen. Here, "₦" is presumed to be a typo or encoding error and ignored.
+- **catalogNumber:** A unique identifier for the specimen in the collection; "49-1" has been inferred from the text.
+- **stateProvince:** The province where the specimen was collected; "Hupei" is an old romanization of the current "Hubei".
+- **locality:** The specific locality where the specimen was found.
+- **scientificName:** The scientific name of the specimen. The provided text has a botanical name that seems to be referencing "Mallotus nepalensis", though the text "Yepunduo" does not have a clear meaning in this context and "IFni NephorGiacese 0" is unclear. *(Note: Minor spelling adjustment was made to fit the standard botanical nomenclature format.)*
+- **country:** The country of collection, which is China (deduced from the institutional information and province).
+
+Please note that certain pieces of the provided text do not correspond to standard Darwin Core fields or are challenging to interpret without context, and as such have been omitted from the JSON output. The JSON format above represents an educated guess based on the partial and somewhat ambiguous information available.
\ No newline at end of file
diff --git a/nbs/data/CyrillicImages/1.jpg b/nbs/data/CyrillicImages/1.jpg
new file mode 100644
index 0000000..109c726
Binary files /dev/null and b/nbs/data/CyrillicImages/1.jpg differ
diff --git a/nbs/data/CyrillicImages/10.jpg b/nbs/data/CyrillicImages/10.jpg
new file mode 100644
index 0000000..034db85
Binary files /dev/null and b/nbs/data/CyrillicImages/10.jpg differ
diff --git a/nbs/data/CyrillicImages/2.jpg b/nbs/data/CyrillicImages/2.jpg
new file mode 100644
index 0000000..1f85be5
Binary files /dev/null and b/nbs/data/CyrillicImages/2.jpg differ
diff --git a/nbs/data/CyrillicImages/3.jpg b/nbs/data/CyrillicImages/3.jpg
new file mode 100644
index 0000000..d91072e
Binary files /dev/null and b/nbs/data/CyrillicImages/3.jpg differ
diff --git a/nbs/data/CyrillicImages/4.jpg b/nbs/data/CyrillicImages/4.jpg
new file mode 100644
index 0000000..9779eec
Binary files /dev/null and b/nbs/data/CyrillicImages/4.jpg differ
diff --git a/nbs/data/CyrillicImages/5.jpg b/nbs/data/CyrillicImages/5.jpg
new file mode 100644
index 0000000..d314ba9
Binary files /dev/null and b/nbs/data/CyrillicImages/5.jpg differ
diff --git a/nbs/data/CyrillicImages/6.jpg b/nbs/data/CyrillicImages/6.jpg
new file mode 100644
index 0000000..4fe14c7
Binary files /dev/null and b/nbs/data/CyrillicImages/6.jpg differ
diff --git a/nbs/data/CyrillicImages/7.jpg b/nbs/data/CyrillicImages/7.jpg
new file mode 100644
index 0000000..d89feca
Binary files /dev/null and b/nbs/data/CyrillicImages/7.jpg differ
diff --git a/nbs/data/CyrillicImages/8.jpg b/nbs/data/CyrillicImages/8.jpg
new file mode 100644
index 0000000..36de533
Binary files /dev/null and b/nbs/data/CyrillicImages/8.jpg differ
diff --git a/nbs/data/CyrillicImages/9.jpg b/nbs/data/CyrillicImages/9.jpg
new file mode 100644
index 0000000..a6286df
Binary files /dev/null and b/nbs/data/CyrillicImages/9.jpg differ
diff --git a/nbs/data/CyrillicImages/CyrillicData.csv b/nbs/data/CyrillicImages/CyrillicData.csv
new file mode 100644
index 0000000..f90044b
--- /dev/null
+++ b/nbs/data/CyrillicImages/CyrillicData.csv
@@ -0,0 +1,37 @@
+image number,Handwritten,Barcode ,scientificName,country ,recordedBy,eventDate,Label text (georgraphy and ecology),
+1,No,MW0464618,Ledum palustre L.,Russia, Вехов В.Н.,1983-07-06,"Карелия, Ругозерская губа Кандалакшского залива Белого моря. П-ов Киндо 4 кв. территории Беломорской биостанции Московского ун-та. Сосновый лес черничный",
+2,Yes,MW0001412,Pinus funebris Kom.,Russia, В. Комаров,1930-09-07,"г. Никольск-Уссурийский, долина р. Супутинки. Сосновая падь",
+3,Yes,MW0001364,"Pinus pumila Rgl.
",Russia,В. Куваев,1951-07-15,"о. 6. Томп. Верх. хр. горы, лиственничная тайга в долине притока Хандыги",
+4,Yes,MW0001309,"Pinus pumila (Pall.) Regl.
",Russia, Карев Г.И.,1930-06-26,Камчатка. Козыревский совхоз = урочище Среднекамчатское,
+5,Yes,MW0001310,Pinus pumila Rgl.,Russia,В. Комаров,1909-10-10,Камчатка. У Петропавловска на Петровской горе,
+6,Yes,MW0001311,Pinus pumila (Pall.) Regel,Russia,Безайс Э.К.,1909,Камчатка. Вулкан Шивелуч. С. Камаки,
+7,Yes,MW0019858,NA,Russia,Гроссет Г.Э.,1955-5-24,"Магаданская обл. Бухта Гертнера. р. Дукча, устье",
+8,No,MW0894758,Crassulaceae,Uzbekistan,И.И. Русанович,1979-6,"Узбекистан. Кашкадарьинская обл. 30-40 км к ЮВ от п. Яккабаг, 10 км к ЮВ от к-ка Ташкурган. ЮЗ отроги Гиссарского хр. Бассейн р. Кызылдарья.",
+9,No,MW0750892,Gramineae,Vietnam,"Прилепский Н. Г., А.Н. Демидова == A.N. Demidova, А.Н. Кузнецов == A.N. Kuznetsov, С.П. Кузнецова == S.P. Kuznetsova",2012-12-24,"B20. Вьетнам, провинция Лам Донг (Lam Dong), горный массив Би Дуп, к северу от станции Giang Ly. Тропический среднегорный смешанный лес с Pinus kesiya Royle ex Gordon и Ericaceae. Склон южной экспозиции, с Lycopodium и Brainea insignis == Vietnam, Lam Dong province, Bi Doup mountain massif, to the north of Giang Ly station. Tropical mid-mountain mixed forest with Pinus kesiya Royle ex Gordon and Ericaceae. Slope of south aspect, with Lycopodium and Brainea insignis",
+10,Yes,MW0895202,NA,Tajikistan,V.N. Pavlov,1965-8,Tajikistan. Барвоз,
+11,No,MW0751368,Theaceae?,Vietnam,"Демидова А. Н., Н.Г. Прилепский == N.G. Prilepsky, А.Н. Кузнецов == A.N. Kuznetsov",2012-12-24,"B20. Вьетнам, провинция Лам Донг (Lam Dong), горный массив Би Дуп, к северу от станции Giang Ly. Тропический среднегорный смешанный лес с Pinus kesiya Royle ex Gordon и Ericaceae. Склон южной экспозиции, с Lycopodium и Brainea insignis == Vietnam, Lam",
+12,No,MW0894425,Polygonaceae,Kyrgyzstan,"А. Серегин, Г. Лазьков, А. Сенников, E. von Raab-Straube, A. Szukala, М. Ганыбаева",2016-7-16,"Киргизия, Ферганская долина, флористический район ПФ (по: Лазьков, Султанова, 2011), 7,5 км к ВСВ от пос. Кызыл-Джар, долина сухоречья (сая), глинистый склон. [Район гербария:] M3",
+13,Yes,MW0584000,NA,Portugal,"A. Seregin == А. Серегин, И. Серегина == I. Seregina",2013-4-4,"Azores, Ilha de São Miguel, descent to Lagoa do Congro, forest in inner crater slope == Азорские острова, остров Сан-Мигел, спуск к оз. Лагуа-ду-Конгру, лес на внутреннем склоне кратера.",
+14,Yes,MW0895202,NA,Tajikistan,V.N. Pavlov,1965-8,Tajikistan. Барвоз,
+15,No,MW0760680,Rostraria cristata (L.) Tzvelev,Greece,A. Seregin == А. Серегин,2010-8-29,"Greece, Peloponnese, Methana Peninsula, above Vathy, along road to Megalochori, pebble bed of seasonal stream. == Греция, Пелопоннес, полуостров Метана, над д. Вати, вдоль дороги в пос. Мегалохори, галечниковое русло сезонного потока.",
+16,Yes,MW0062372,Polygonaceae,Russia,О. Червекова,1982-6,"Приморский край, Анучинский р-н, с. Старая Варваровка, совхоз ""Женьшень"".",
+17,Yes,MW0219933,Potamogeton pusillus L. ? Не Potamogeton ! (злак?),Russia,Пензенск. губ. Городищенск. у.,NA,Пензенск. губ. Городищенск. у.,"This one has two names?, no collection date"
+18,No,MW0584000,NA,Portugal,"A. Seregin == А. Серегин, И. Серегина == I. Seregina",2013-4-4,"Azores, Ilha de São Miguel, descent to Lagoa do Congro, forest in inner crater slope == Азорские острова, остров Сан-Мигел, спуск к оз. Лагуа-ду-Конгру, лес на внутреннем склоне кратера.",
+19,No,MW0894471,Gypsophila,Kyrgyzstan,А. Серегин,2016-7-12,"Киргизия, Таласский Алатау, флористический район ЗТ (по: Лазьков, Султанова, 2011), верховья р. Чичкан (чуть выше устья р. Терсты), правый борт ущелья, склоновый луг (пастбище) со скальными выходами, зарослями шиповника и арчи. [Район гербария:]",
+20,No,MW0798769,Cruciferae sp.,Germany,A. Seregin == А. Серегин,2014-10-11,"Germany, North Rhine-Westphalia, Dortmund, Westfalenpark (E corner), compost heaps on operating yard; single plant. == Германия, Северный Рейн-Вестфалия, г. Дортмунд, Вестфаленпарк (восточный угол), земляные кучи на хоздворе; одно растение.",
+21,No,MW0895472,Compositae,Kyrgyzstan,С. Туманян,1975-7-NA,"Киргизия, Алайский хр. в районе пика Ленина. Альпийские луга.",
+22,Yes,MW0161957,Poaceae,Russia,Павлов В.,1971-7-11,"Виды, собранные на мелкоземном участке гребня гранитного массива, по правому берегу р. Чугулукка-юрюе, против озера Балык",
+23,No,MW0001301,Pinus pumila (Pallas) Regel,Russia,И.М. Красноборов,1988-7-28,"Магаданская обл., Верхне-Колымское нагорье, окр. пос. Кулу, стационар ""КОНТАКТ"". Осыпи сланцев по южному склону",
+24,Yes,MW0001307,Pinus pumila,Russia,Гроссет Г.Э.,2019-3-13,Магадан. Окрестности,
+25,Yes,MW0001309,Pinus pumila (Pall.) Regl.,Russia,Карев Г.И.,1930-6-26,Камчатка. Козыревский совхоз = урочище Среднекамчатское,
+26,Yes,MW0001310,Pinus pumila Rgl.,Russia,В. Комаров,1909-10-10,Камчатка. У Петропавловска на Петровской горе,
+27,No,MW0001253,Pinus sibirica Du Tour,Russia,Красноборов И. М.,1983-9-2,"Алтай, Катунский хр., окр. с. Нижн. Уймонский. Лиственнично-кедровый лес",
+28,No,MW0001260,Pinus sibirica Du Tour,Russia,Красноборов И.,2019-3-28,"Алтай, Кош-Агачский р-н, окр. с. Курай, Северо-Чуйские белки, северн. склон. Кедрово-лиственничный лес",
+29,Yes,MW0001200,Pinus sibirica,Russia,Б. Горелов,1947-7-12,"Читинская область, Улетовский р-н. Подножье гольца Индинского. Лиственница, ель, пихта.",
+30,Yes,MW0001203,Pinus sibirica Du Tour,Russia,Назаров М.,1949-9-1,"Иркутск. обл., Тофаларск. р-н. Саянский Государств. Заповедник. Бл. поселка Верхняя Гутара. В смешанном лесу по рч. Каменке",
+31,Yes,MW0001208,Pinus sibirica Du Tour,Russia,Назаров М. И.,1929-7-27,Тункинские гольцы. Р. Архыт в устье Хонхобоя. На гольце. В верхних пределов леса,
+32,No,MW0001170,Pinus sylvestris L.,Russia,Шведчикова Н.,1959-8-1,Алтайский заповедник. Долина реки Чулышман. Нижнее течение. Участки соснового леса,
+33,No,MW0001128,Pinus sylvestris L.,Russia,"Куваев В., Костеша О.",1972-7-3,"Читинск. обл., Каларский р-н. Песчаный ложок в дюнах у СЗ окраины Песков, по пр. бер. р. Ср. Сакукан у р. ц. Чара",
+34,Yes,MW0001118,Pinus sylvestris L.,Russia,Куваев В.,1951-9-2,"У. Алд. Сосн. бор на песч. холмах возвыш. Эльгэ, между Тандой и Алданом",
+35,No,MW0001105,Pinus sylvestris L.,Russia,"
+Петелин Д. А",1986-9-2,"Хабаровский край, Аяно-Майский р-н, водораздел рр. Алгама и Учур в 1 км к В от устья Алгамы, покатый (ок. 12°) склон северо-зап. экспозиции; лиственничник с березой и сосной бруснично-осоковый на месте гари 50-летней давности",
\ No newline at end of file
diff --git a/nbs/data/CyrillicResults/1.txt b/nbs/data/CyrillicResults/1.txt
new file mode 100644
index 0000000..000de54
--- /dev/null
+++ b/nbs/data/CyrillicResults/1.txt
@@ -0,0 +1,13 @@
+```json
+{
+ "institutionCode": "MW",
+ "catalogNumber": "0464618",
+ "scientificName": "Ledum palustre L.",
+ "country": "Russia",
+ "stateProvince": "Карелия",
+ "locality": "Ругозерская губа Кандалакшского залива Белого моря. П-ов Киндо",
+ "eventDate": "1983-07-06",
+ "recordedBy": "Вехов В.Н.",
+ "fieldNumber": "6183"
+}
+```
\ No newline at end of file
diff --git a/nbs/data/CyrillicResults/10.txt b/nbs/data/CyrillicResults/10.txt
new file mode 100644
index 0000000..f6dad3f
--- /dev/null
+++ b/nbs/data/CyrillicResults/10.txt
@@ -0,0 +1,15 @@
+Based on the text provided, we can deduce that the herbarium specimen is from the Moscow University Herbarium (Herbarium Universitatis Mosquensis - MW) and was collected in Tajikistan by a collector named V.N. Pavlov in August 1965. The catalogue or accession number for this specimen appears to be "MW 0 895 202."
+
+The JSON format according to the Darwin Core standard for biodiversity specimens with the available information would look like this:
+
+```json
+{
+ "institutionCode": "MW",
+ "catalogNumber": "MW0895202",
+ "country": "Tajikistan",
+ "recordedBy": "V.N. Pavlov",
+ "eventDate": "1965-08"
+}
+```
+
+Please note that some fields contain Cyrillic characters which have not been included because the specific names or other details that would be in Cyrillic were not provided. Additionally, the term "Burg tia tive" and "Saphy" listed in the text are not clear in this context and could not be interpreted into relevant biodiversity data without further information. If these terms represent scientific names or other pertinent data, they may need to be added to the JSON accordingly.
\ No newline at end of file
diff --git a/nbs/data/CyrillicResults/2.txt b/nbs/data/CyrillicResults/2.txt
new file mode 100644
index 0000000..bb57624
--- /dev/null
+++ b/nbs/data/CyrillicResults/2.txt
@@ -0,0 +1,22 @@
+Based on the provided text, which is clearly incomplete and mostly consists of numbers, codes, or fragments of words, it is quite challenging to extract meaningful data that aligns with the Darwin Core (DwC) standard. Darwin Core is a framework intended to facilitate the sharing of information about biological diversity by providing a reference for the terms used to describe the core biodiversity data categories.
+
+However, I will attempt to construct a JSON representation with the potential DwC fields that can be inferred:
+
+```json
+{
+ "institutionCode": "MW",
+ "catalogNumber": "0001412",
+ "scientificName": "Pinus fundbais uma",
+ "recordNumber": "1",
+ "country": "Russia"
+}
+```
+
+Explanation of fields:
+- `institutionCode`: Typically represents the code for the institution where the specimen is held; "MW" may refer to the Moscow University Herbarium (Herbarium Universitatis Mosquensis).
+- `catalogNumber`: A unique identifier for the specimen within the herbarium; assumed from "MW 0 001 412" to be "0001412" disregarding leading zeroes.
+- `scientificName`: The scientific name of the organism; inferred from "Pinus fundbais uma," though it may be a misspelling or incomplete due to the provided fragment.
+- `recordNumber`: The sequence number of the record; taken from the list "1" at the beginning, though its relevance is not clear.
+- `country`: Not directly provided, but since the herbarium information is provided in a mix of Latin and Cyrillic and refers to "Univ. Mosquensis," it is inferred that the country could be Russia.
+
+Please note that due to the sparse and unclear information, the 'country' field is inferred rather than extracted from the provided text, and the other fields are speculative best estimates based on the given data. The true Darwin Core record for this specimen would require additional, verifiable information.
\ No newline at end of file
diff --git a/nbs/data/CyrillicResults/3.txt b/nbs/data/CyrillicResults/3.txt
new file mode 100644
index 0000000..b0a3bb8
--- /dev/null
+++ b/nbs/data/CyrillicResults/3.txt
@@ -0,0 +1,18 @@
+Based on the provided text, there is not sufficient information to accurately fill in a complete JSON record according to the Darwin Core standard. The Darwin Core standard includes many fields such as `scientificName`, `locality`, `eventDate`, `recordedBy`, and more, which cannot be determined from the text snippet you've given.
+
+However, we can attempt to extract any information present and form a partial JSON record:
+
+```json
+{
+ "institutionCode": "Herbarium Univ. Mosquensis",
+ "catalogNumber": "MW 0 001 364",
+ "country": "Russia"
+}
+```
+
+Explanation of the inferred values:
+- The field `institutionCode` is inferred from "Herbarium Univ. Mosquensis" which suggests the herbarium of the University of Moscow, which would be in Russia.
+- `catalogNumber` seems to be represented by "MW 0 001 364", which typically denotes the unique identifier for the specimen in the herbarium or museum collection.
+- The `country` is not given directly, but the mention of Moscow suggests that the country is Russia.
+
+Please note that the Darwin Core standard has many required fields that cannot be populated from the provided text. For proper usage of the Darwin Core standard, more information on the specimen, its collection event, and identification would be needed. The partial JSON provided here is based solely on inference and the limited information provided.
\ No newline at end of file
diff --git a/nbs/data/CyrillicResults/4.txt b/nbs/data/CyrillicResults/4.txt
new file mode 100644
index 0000000..63c35ee
--- /dev/null
+++ b/nbs/data/CyrillicResults/4.txt
@@ -0,0 +1,15 @@
+Based on the details provided, here is the relevant information formatted into JSON according to the Darwin Core standard for biodiversity specimen. The country appears to be Russia, specifically inferred from the region "Камчатки" which is the Russian word for Kamchatka.
+
+```json
+{
+ "institutionCode": "MW",
+ "catalogNumber": "0001309",
+ "scientificName": "Pinus pumila (Pallas) Regel",
+ "locality": "Долина р. Камчатки",
+ "eventDate": "1980-06-26",
+ "recordedBy": "Г. И. Карев",
+ "country": "Russia"
+}
+```
+
+Please note that some of the text is not clearly readable or relevant for Darwin Core fields, so it has been omitted. The herbarium location is given as the Herbarium of the University of Moscow (Herbarium Univ. Mosquensis), and the "Collector's Name" (recordedBy) and "Collection Date" (eventDate) have been derived from the text that says "26 - vi 1980 %. Собрал Г. И. Карев." This is commonly understood to indicate that G. I. Karev collected the specimen on June 26, 1980. The scientific name "Pinus pumila (Pallas) Regel" has been provided as well.
\ No newline at end of file
diff --git a/nbs/data/CyrillicResults/5.txt b/nbs/data/CyrillicResults/5.txt
new file mode 100644
index 0000000..b9d4d2e
--- /dev/null
+++ b/nbs/data/CyrillicResults/5.txt
@@ -0,0 +1,31 @@
+Based on the provided text, here is the extracted relevant information in a JSON format using Darwin Core standard fields:
+
+```json
+{
+ "institutionCode": "MW",
+ "catalogNumber": "0001310",
+ "collectionCode": "Herbarium Univ. Mosquensis",
+ "scientificName": "Pinus pumila",
+ "country": "Russia",
+ "stateProvince": "Kamchatka",
+ "locality": "У Петром . Словске",
+ "eventDate": "1909",
+ "recordedBy": "V. Komarov",
+ "fieldNotes": "Итер Kamczaticum II",
+ "eventRemarks": "Камчатская Экспедиція Ф. П. Рябушинскаго"
+}
+```
+
+Due to the nature of the text and information gaps, some assumptions and inferences were made:
+- "Herbarium Univ. Mosquensis" is interpreted as the collection code.
+- "MW" is assumed to be the institution code of the Moscow University herbarium.
+- The scientific name "Pinus pumila" is provided in the text.
+- The country has been inferred to be Russia, considering "Камчатка" refers to the Kamchatka Peninsula in Russia.
+- Locality information was inferred from the text "У Петром . Словске" which could refer to a specific area in the Kamchatka Peninsula.
+- "1909" is assumed to be the year of the event (collection date).
+- "V. Komarov" is the recorded by name, likely referring to Vladimir Komarov, a Russian botanist.
+- Additional expedition information is included in 'eventRemarks', indicating the Kamchatka expedition organized by F. P. Ryabushinsky.
+- Since there's no specific date or range, only the year 1909 is included in 'eventDate'.
+- The field 'fieldNotes' includes "Iter Kamczaticum II" as this might indicate a series or part of the expedition notes.
+
+There is some uncertainty regarding the correct interpretation of some fields due to missing information and potential language barriers. This is the best interpretation based on the given text and available knowledge.
\ No newline at end of file
diff --git a/nbs/data/CyrillicResults/6.txt b/nbs/data/CyrillicResults/6.txt
new file mode 100644
index 0000000..78f898d
--- /dev/null
+++ b/nbs/data/CyrillicResults/6.txt
@@ -0,0 +1,17 @@
+```json
+{
+ "institutionCode": "MW",
+ "collectionCode": "0 001 311",
+ "catalogNumber": "110926",
+ "recordNumber": "1439a",
+ "scientificName": "Pinus pumila",
+ "identifiedBy": "V. Komarov",
+ "dateIdentified": "1909",
+ "eventDate": "1909",
+ "country": "Russia",
+ "stateProvince": "Kamchatka",
+ "expedition": "Камчатская Экспедиція Ф. П. Рябушинснаго",
+ "recordedBy": "V. Komarov",
+ "fieldNotes": "Iter Kamczaticum II"
+}
+```
\ No newline at end of file
diff --git a/nbs/data/CyrillicResults/7.txt b/nbs/data/CyrillicResults/7.txt
new file mode 100644
index 0000000..57103b8
--- /dev/null
+++ b/nbs/data/CyrillicResults/7.txt
@@ -0,0 +1,21 @@
+The text provided contains very limited information in a mix of Cyrillic, numeric and Latin characters. I will attempt to extract the relevant data in accordance to the Darwin Core (DwC) standard for a biodiversity specimen record to the best of my ability given the text provided.
+
+- `Herbarium Univ. Mosquensis` can be interpreted as the name of the herbarium where the specimen is stored, which would translate to "Herbarium of Moscow University". This indicates that the country is likely Russia or a Russian speaking country.
+- `MW 0 019 858` appears to be a catalog or accession number for the specimen.
+- The date `24-VII-1955` is likely the collection date of the specimen, formatted as day-month-year.
+- `J. Tpocent` is possibly the collector's name, though the spelling may be phonetic or incorrectly transcribed.
+
+Given the unclear text and the lack of scientific names, locations, or other typical Darwin Core fields, the resulting JSON would contain a number of null or assumed fields, filled only with the information that can be sensibly deduced:
+
+```json
+{
+ "institutionCode": "Herbarium Univ. Mosquensis",
+ "catalogNumber": "MW 0 019 858",
+ "country": "Russia",
+ "eventDate": "1955-07-24",
+ "recordedBy": "J. Tpocent",
+ "basisOfRecord": "PreservedSpecimen"
+}
+```
+
+Note that `basisOfRecord` is assumed to be `PreservedSpecimen` given the context of a herbarium which typically stores preserved plant specimens. The rest of the text does not provide clear, usable data for the JSON output. If additional reliable information could be obtained from the text, it should be included in the JSON structure following the appropriate Darwin Core terms.
\ No newline at end of file
diff --git a/nbs/data/CyrillicResults/8.txt b/nbs/data/CyrillicResults/8.txt
new file mode 100644
index 0000000..d93bb06
--- /dev/null
+++ b/nbs/data/CyrillicResults/8.txt
@@ -0,0 +1,20 @@
+```json
+{
+ "institutionCode": "MW",
+ "catalogNumber": "894 758",
+ "phylum": "Crassulacare",
+ "country": "Uzbekistan",
+ "stateProvince": "Кашкадарьинская обл.",
+ "locality": "30-40 км к юв от п. Яккабаг, 10 км к ЮК от к-ка Ташкурган. Отпоги Гиссарского хр. Бассейн р.Кызылларья",
+ "minimumElevationInMeters": "2000",
+ "maximumElevationInMeters": "2500",
+ "eventDate": "1979-06-26/1979-06-27",
+ "recordedBy": "И.И.Русанович",
+ "identifiedBy": "not provided",
+ "collectionCode": "MHA",
+ "fieldNumber": "not provided",
+ "basisOfRecord": "PreservedSpecimen"
+}
+```
+
+Please note that the field `identifiedBy` is marked as "not provided" because the relevant information was not supplied in the text. The `fieldNumber` is also marked as "not provided" because while there seems to be a number '3161', it's unclear if it refers to a field number. Some data (like the specific name of the species or the exact locality in non-Cyrillic script) are also not provided within the text supplied. The Cyrillic word "Выс." and "Alt." likely refer to "elevation" and "altitude" respectively, so the minimum and maximum elevation are inferred from these words.
\ No newline at end of file
diff --git a/nbs/data/CyrillicResults/9.txt b/nbs/data/CyrillicResults/9.txt
new file mode 100644
index 0000000..c5fe5e4
--- /dev/null
+++ b/nbs/data/CyrillicResults/9.txt
@@ -0,0 +1,21 @@
+```json
+{
+ "institutionCode": "MW",
+ "catalogNumber": "MW 0 750 892",
+ "scientificName": "Lycopodium u Brainea insignis",
+ "family": "Gramincae",
+ "country": "Vietnam",
+ "stateProvince": "Lam Dong",
+ "locality": "Bi Doup mountain massif, north of Giang Ly station",
+ "decimalLatitude": "12.18744",
+ "decimalLongitude": "108.672972",
+ "coordinateUncertaintyInMeters": "1561",
+ "habitat": "Tropical mid-mountain mixed forest with Pinus kesiya Royle ex Gordon and Ericaceae",
+ "eventDate": "2012-12-24",
+ "recordedBy": "N.G. Prilepsky, A.N. Demidova, A.N. Kuznetsov, S.P. Kuznetsova",
+ "identifiedBy": "A.N. Demidova, N.G. Prilepsky",
+ "fieldNumber": "461"
+}
+```
+
+Please note that the habitat mentioned in the text describes the environment but does not correspond exactly to a Darwin Core term. The "coordinateUncertaintyInMeters" assumes that "1561 m" refers to the elevation which is not the correct use for that term. However, due to the context provided, there is no other suitable Darwin Core term given for elevation or altitude, so this is inferred to the best of my ability. If the exact Darwin Core term for elevation becomes clear, it should be updated accordingly.
\ No newline at end of file
diff --git a/nbs/data/gt-labels/collector_gt.txt b/nbs/data/gt-labels/collector_gt.txt
new file mode 100644
index 0000000..6c9e67a
--- /dev/null
+++ b/nbs/data/gt-labels/collector_gt.txt
@@ -0,0 +1,1008 @@
+1697659851: Nazarov M. I.
+2573258025: M. Morgan [?]
+2597666444: Lortet Clémence
+1931288980: H. H. Rusby
+1930241969: R. Pinkus
+1929944910: P. Train
+1931007576: Frère Rolland-Germain
+1928514234: J. F. Poggenburg
+1928658806: J. K. Small
+1931124118: S. Watson
+1929752296: W. H. Camp
+2562899020: W. M. Whitfield
+1931255575: E. Palmer
+1929858478: W. A. Setchell
+1937505702: P. Wilson
+474656434: Baudouin, A.
+1265505301: Krukoff, B.A.
+1265483891: Mexia, Y.
+3416707560: Fulford, Margaret H.
+3416740305: von Handel Mazzetti, H.
+1426171668: Jorgensen, P.
+1802583431: J. Ball
+2512855384: W. N. Koelz
+1318293083: W. L. Abbott
+3005670412: C. Robinson
+1318526260: -. Collins & -. Kempton
+1802569032: C. G. Pringle
+1456345670: R. K. Godfrey
+1998333126: C. A. Purpus
+1998550976: C. W. Jenks
+1998969928: [no data available]
+1998473911: J. A. Cushman
+2425414867: A. H. Armstrong
+1999314904: F. G. Floyd
+1998413329: Kate Furbish
+1999026558: C. J. W. Schiede
+1998316723: M. L. Fernald
+1999167579: C. C. Epling & W. Robinson
+3356803607: J. Lunell
+2575053354: B. Maguire & J. D. Redd
+1999311542: R. C. Bean & D. White
+1999283271: Kate Furbish
+3459889344: A. F. G. Kerr
+1999006043: F. S. Collins
+1998497875: S. D. McKelvey
+1998722787: J. F. Collins
+1999143240: N. T. Kidder
+2608680770: R. G. Reeves
+1999056693: A. O. Garrett
+2608673843: V. L. Cory
+1998571450: A. E. Blewitt
+1998994775: L. A. Wheeler
+1998482052: E. B. Harger
+1999328636: C. Schweinfurth
+731408891: Camus, A.
+438153065: Fries, R.E.
+437056001: Le Guillou, M.
+437693558: Samat
+1212575663: Fournié, F.
+1212567865: s.c.
+438120086: Tuczkiewicz, D.
+438645471: Hahn, L.
+438118888: Chevalier, A.J.B.
+437639743: Coqueray, J.
+437308856: Humbert, H.
+438633225: Drège, J.F.
+439286989: Humbert, H.
+474921033: Saint-Hilaire, A. de
+667499366: Guillon, A.
+437356552: Thorel
+437659994: Eggers, H.F.A. von
+437656118: Fernald, M.L.
+436990042: Glaziou, A.
+438202601: Rodriguez, L.
+438582009: Rodriguez, L.
+438299177: Perrier de la Bâthie, H.
+437448502: Chevalier, A.J.B.
+3334581963: Koch, Lange
+2625898343: W. McAtee
+1563245392: C. O. Levine
+2426921679: P. A. Rydberg & E. A. Bessey
+2236147388: A. Eastwood & J. T. Howell
+1675972550: S. T. Blake
+2452262576: O. Buchtien
+1802596511: E. A. Mearns
+1563323313: W. Arnell
+2849254057: A. W. Ivanov
+1457812021: J.B. McFarlin
+2610882325: Roland Edgar Cooper, Arthur Kilpin Bulley, Roland Edgar Cooper, Arthur Kilpin Bulley
+1456008930: C. Seler & E. Seler
+2571435846: A. H. Alston
+1321842019: G. F. Papenfuss
+2235750047: E. Asplund
+2900445104: F. H. Knowlton
+1456276626: P. C. Standley
+3467360175: E. C. White
+1317278320: W. W. Eggleston
+2421752896: P. B. Kennedy
+1805297621: Alexander W. Evans
+1805431168: George E. Nichols
+1039025105: Charles A. Weatherby
+1038991156: Charles C. Parry
+1805292273: John K. Small
+1038924603: Charles A. Weatherby
+1805298355: Daniel C. Eaton
+1805292635: John M. Holzinger
+1038926232: Hugh S. Clark
+1805440196: Edwin Faxon
+1038967579: William R. Dudley
+1038933447: Hugh S. Clark
+2900436116: A. Chase
+2565407235: B. F. Bush
+2235813242: C. G. Pringle
+2625851024: T. Howell
+2592277723: P. H. Allen
+1675940934: G. Firmin
+1321443340: E. D. Merrill
+2643353998: A. H. Curtiss
+2284193310: W. W. Rowlee
+2571504032: O. Andersen
+1563210464: H. G. Dahlstedt
+1456213805: B. J. Pickel
+1320488541: J. G. Jack
+2565452643: J. Haberer
+1702847152: J. Lundequist
+1930574598: J. H. Barnhart
+1929244776: N. L. Britton
+1928370989: M. Ruger
+2423994521: W. R. Maxon
+2573563462: C. F. Durant
+2284153322: S. Venturi
+2549491705: J. Macoun
+2397800089: J. W. Toumey
+1456001675: F. M. Uhler
+1456143688: F. H. Knowlton
+1675930631: I. W. Clokey
+1319864119: A. Chase
+1563285661: W. N. Koelz
+1322650194: H. E. Stork
+1318182025: N. L. Britton, E. G. Britton & M. S. Brown
+3005750161: -. Combs & Rolfs
+3028978025: Kotschy,C.G.T.
+3028987457: Bornmüller,J.F.N.
+3028997059: Sudre,H.
+3029006515: Herzog,T.C.J.
+2514524961: Clemens J; Clemens MS
+2513842767: Schmidt
+2516762837: Sloff JG; Soest JL van
+2513616378: Kleinhoonte A
+2514667385: Louis JLP
+2514761846: Bolten D
+2515642096: Boldingh I
+2514528877: Bernard; Mogg AOD
+2516640138: Haviland GD
+2514723844: Herb Leeuw WC de
+2515771511: Dinn TJ
+2514102813: Poilane E
+2516419324: Kloos Jr AW
+2517526453: Buysman M
+2513862848: Baenitz CG
+2516471936: Docters van Leeuwen WM
+2517375344: Stork AL
+2513600930: Chase A
+2514362967: Hassler E
+2516836918: Poilane E
+1056069802: Everard Ferdinand im Thurn
+1638424637: Hofmann,Hermann
+1701811884: Olinus Nyhuus
+1702371088: H. Resvoll-Holmsen
+1701599986: Levi Rygg
+1702365209: J. M. Norman
+1702012844: R. E. Fridtz
+1702358167: Signe Fransrud
+1702001459: E. Jørgensen
+1702352653: E. V. Ekstrand
+1701991397: Torleiv Hannaas
+1701470291: Johs. Lid
+1701976980: Odd J. Aalen
+1702438076: Johs. Reiersen
+2005771863: E.J. Taquet
+1998773668: J. E. Dinsmore
+1998991904: A. J. Grout
+1999413720: G. W. Stevens
+1999320848: G. B. Rossbach
+2446828060: M. L. Fernald, E. B. Bartram & B. H. Long
+2012879889: E. L. Braun
+1998481102: Kate Furbish
+1999330570: C. H. Knowlton
+1999047345: R. W. Woodward
+1998758107: A. F. Hill
+1998465329: E. B. Chamberlain
+1999317509: H. D. House
+1999028044: F. V. Coville & F. Funston
+1830992236: Handel-Mazzetti,H.R.E. von
+1135439455: Heldreich,T.H.H. von & Tuntas,B.
+3052190309: Halácsy,E. von
+1584383967: Warburg,O.
+2859424505: Jennings, O.E.
+2859305213: Jennings, O.E.; Jennings, G.K.
+2859406708: Jennings, O.E.
+2859014941: Dreisbach, R.R.
+1563140142: Faith Pennebaker Mackaness
+2858981761: Berkheimer, D.
+2859042459: Millward, W.
+2859205685: Holmes, K.R.
+2382623542: Fitch, J.E.
+2515947653: Jeswiet J
+2515509875: Olden E van
+2516458084: Blaas W
+2512978024: Oborny A
+2513965497: Schultes
+2515252166: Brade AC
+2513596553: Pringle CG
+2514376754: Fiebrig K
+2516750403: Degener O; Nitta J
+1055366369: Edward Palmer
+1675890176: E. Asplund
+2235995189: R. Vines
+3092956655: G. R. Vasey
+1318027385: E. S. Steele
+1321746477: E. L. Ekman
+1836712931: C. B. Clarke
+1317865686: W. R. Maxon
+1062492619: Mildbraed,G.W.J.
+1260164220: Boris A. Krukoff
+1424543510: Font Quer, P.
+1988124288: J.K. Henry
+1988239866: John Davidson
+439241145: Thwaites, G.H.K.
+1840203964: Humbert H.
+2243263149: Leclercq A.
+1839459389: Robyns W.
+1840095043: Hendrickx F.L.
+1840405939: Tisserant C.
+1839928488: Gillet Justin
+3467294610: H. H. Bartlett
+3467354375: H. Möller
+2625852862: R. C. Ching
+2236176339: A. Ducke
+2452236904: J. S. Cotton
+1702798423: N. H. Nilsson
+1802552799: F. W. Pennell & E. P. Killip
+3001102338: V. Sirgo
+3001176019: J. Treboux
+3001081883: H. Hendrikson
+3001166637: J. Treboux
+3001179543: E. Niclasen
+2284326054: P. White
+1675819460: F. J. Hermann
+1456165342: C. Baker & -. Earle
+2549603947: W. C. Cheng
+2235755905: A. Ducke
+1318797445: H. E. Box
+2515216190: Yuncker TG; Dawson RF; Youse HR
+2514533738: Dorgelo JD
+2517047032: Stolz AF
+2514461919: Leresche LFJR
+2514515297: Steenis CGGJ van
+2515925582: Germain RGA
+2515196651: Ramos M
+2516405868: Haviland GD
+2517108376: Howe MA
+2514917088: Tsang WT
+2517171312: Kjellman FR
+2875995685: Utkin,L.
+1935938475: J. Cogolludo
+1935892381: A. Matthies
+3091199136: A. Nelson; J. F. Macbride
+2807578524: Samuel Mills Tracy
+2807261388: Ernest J. Palmer
+2807456023: R. W. Strandtmann
+2807365108: W. A. Silveus
+2807319680: Samuel Mills Tracy
+2807521521: Herbert H. Smith
+1019531437: Jungner, J.R.
+1675777378: G. W. Letterman
+2284189808: M. Tommassini
+2549496787: V. Duran
+2397779786: I. F. Tidestrom
+2284257102: E. L. Ekman
+1319210580: H. M. Curran
+3092906623: D. Eyles & M. Eyles
+2514607056: Splitgerber FL
+2513624958: Unknown
+2515905904: Wit HCD de
+2514332268: Bordère H
+2514380395: Vriese WH de
+2517280563: Mexia YEJ
+2515241297: Cantonspark Baarn
+2516406700: Langhe JE de
+2516224982: Kok Ankersmit HJ
+2515327027: Unreadable
+2235965636: W. Forwood
+2625876902: A. Nelson & R. A. Nelson
+2625904355: H. P. Chandler
+2235956175: W. Cannon
+2512820352: J. W. Gillespie
+2235866543: Rahmat Si Boeea
+2426902656: J. N. Rose & W. R. Fitch
+2573212768: Helfer J.W.
+1701276937: Anton Røstad
+1701335898: Henrik Printz
+1701231105: Ralph Tambs Lyche
+1701285949: Johannes Musæus Norman
+1701266314: Anton Røstad
+1701236914: Olav Gjærevoll
+1701287735: And. Neander
+187209281: Anton Røstad
+1701766050: Thorolf Vogt
+1937521898: W. H. Witte
+1949855671: E. L. Sturtevant
+1928175542: A. A. Heller
+1929143537: R. C. Murphy
+1930252111: H. A. Gleason
+1989131227: Jennie Shaddick & Homer C. Skeels
+1989431045: L. M. Umbach
+1988737850: Henry H. Rusby
+1989039350: R. O. Schornherst
+1989313099: Samuel H. Camp & Donna R. Camp
+2274177724: A.A. Eaton
+1988969066: S. Kuiper
+2252181535: W. S. F.
+2252151414: S. Flowers
+1977964375: Marie-Victorin, Fr.; Rolland-Germain, Fr.; Brunel, Jules F.; Rousseau, L. Zéphirin
+1057464806: A G. Bagshawe
+1057260090: Auguste Nicolas Pomel
+813379916: Gossweiler, J.
+1057532849: Nathaniel Lyon Gardner
+1056306307: Johann Maria Hildebrandt
+1057234135: Gunnar Konstantin Kjellberg
+1057553687: Jacques Samuel Blanchet
+436703955: Blanc
+2549492260: P. Hagelbarger
+1318212360: K. von Sneidern
+2848506530: T. Tang
+1321575246: W. A. Kellerman
+3005727173: R. Combs
+1321995468: C. O. Levine
+1322817564: D. L. Topping
+1675999637: E. H. Graham
+1259191398: Russell J. Seibert
+1257819684: Theodor Kotschy
+2430551392: Marcus E. Jones
+2989927287: Aven Nelson
+1455174725: J. B. McConnell
+2284154890: L. Constance & H. L. Mason
+2900461439: A. Eastwood
+2236031445: E. P. Walker & E. Walker
+1843567618: P. C. Standley
+1675973762: C. T. Mohr
+1676047230: -. Heiland
+1318477305: F. V. Coville & T. H. Kearney
+3698756098: F. Anderson
+2517167632: Unknown
+2513883304: Bakhuizen van den Brink Sr RC
+2516696767: Unreadable
+2513656631: Missbach R
+2513761556: Bünnemeijer W
+2514561032: Unknown
+2517198083: Clokey IW
+2516271128: Ule EHG
+2516743780: Bordère H
+2805014861: Schlechter,R.
+144854485: Engler,A.
+144838164: Prelinger
+864909330: Wright,C.
+144902385: Ilse
+144838161: Schulz,R.
+1987004085: Cronquist, Arthur
+1986885681: Bassett Maguire
+2512804326: L. F. Ward
+1322958063: E. C. Leonard
+2512759946: F. W. Johnson
+3696559994: Sharp, Aaron J.
+1056014429: Arthur Francis George Kerr
+436716407: Desnos, V.
+1563221199: N. Nikiphorova
+2512761763: Anselme
+1321333954: W. H. Brewer
+1931060382: W. H. Blanchard
+1929796452: F. S. Collins
+1928452768: Collector unknown
+1930778822: N. L. Britton
+1929535381: L. E. Wehmeyer
+2565969320: Guadagno,M.
+3392108313: Rigo,G.
+2562090317: Guadagno,M.
+1852124166: L. Kenover
+2236057326: M. Bourgeau
+1702754520: S. F. Blake
+3043554903: J. Nusker
+1322398916: J. K. Small
+1318373170: E. L. Ekman
+3357284466: C. F. Baker
+2236142683: H. F. Pittier
+1701792313: Axel Arrhenius
+1701852084: Axel Arrhenius
+1702340583: J. M. Norman
+1702324083: Bernt Lynge
+1702233404: Hartvig Johnsen
+900324877: O. A. Hoffstad
+1701892850: Fr. Areschoug
+1701783462: Finn Ch. Sørlye
+1701688454: H. Resvoll-Holmsen
+1702326021: Ove Dahl
+1702040261: B. Esmark
+1702265667: T. Lillefosse
+2867598821: R. E. Fridtz
+1927910520: F. S. Earle
+1929163462: N. L. Britton
+1930449245: G. V. Nash
+1928350945: T. L. Steiger
+1929317363: W. de W. Miller
+2251702912: J. G. Schaffner
+1930924401: R. C. Friesner
+2235396749: J. Clemens
+1929252363: F. O. Grover
+1927995752: Collector unknown
+438294181: Rothkegel, L.
+438824413: Lhote, H.
+439294172: Perrier de la Bâthie, H.
+667355588: Fruchard
+474752652: Deplanche, E.
+437804966: Tulasne, M.
+437667776: Poilane
+437016988: Weddell, H.A.
+437498688: Dombey, J.
+437205236: Durand, E.
+438972235: Privault, D.
+1936123239: Dihm
+1935990933: A. Eig & A. Grizi
+1320460457: E. C. Leonard
+1702851818: P. C. Standley & R. Torres Rojas
+1675788663: F. Wood
+1456419962: M. E. Jones
+3340047855: E.L. Reed
+1927799942: E. Palmer
+1930203372: K. K. Mackenzie
+1929034867: W. A. Ducke
+2012771368: K. Wercklé
+1930177611: J. Clemens
+1928904177: C. F. Baker
+1928276301: E. P. Killip
+1929455203: H. Luederwaldt
+1930554353: O. K. Stark
+1928034398: P. C. Standley
+1929153673: N. L. Britton
+1930323339: E. B. Payson
+2513816674: Unknown
+2514046501: King's collector G
+2514666766: Sloff JG
+2516864891: Koorneef J
+2513843663: Ule EHG
+2514563201: Bloembergen S
+2513556994: Parodi LR
+2517073037: Boswezen (Surinam)
+2513548374: Balansa B
+2513105864: Chase A
+2514597432: Bünnemeijer HAB
+2513686671: Hens F
+2513064820: Tauscher JA
+2516564502: Sandwith NY; Green TH
+2513517294: Weatherwax P
+2515481887: Brass LJ; Fly River Expedition of the American Musem of Natural History
+2513028120: Bilimek D
+2516037063: Bourgeau (Algeria series) E
+2514219426: Merrill ED
+3020071550: H. Precht
+3020078501: Kluge
+3019979401: Paul Lackschewits
+1930839217: W. A. Ducke
+1991428478: J. N. Rose
+1928506645: W. R. Taylor
+1930481189: J. D. King
+1981501278: E. Palmer
+1928908236: G. H. H. Tate
+3503204182: F.E. Clements; E.S. Clements
+1998729261: F. C. Seymour
+1998968996: C. H. Knowlton
+2425445076: A. A. Eaton
+1998882868: C. Wright
+1999105588: C. H. Bissell
+1998590947: A. S. Pease
+1998818359: C. B. Wolf
+1998308840: M. A. Day
+1998543749: E. B. Chamberlain
+1998975109: [no data available]
+1999410499: J. Wolf & J. T. Rothrock
+2425404585: E. F. Williams & M. L. Fernald
+1999314245: C. A. Weatherby
+2573155691: Boorman, J.L.
+2573161023: Meinshausen, K.F.
+2572960903: E. Brainerd; V. B. Baird
+2573054178: W. W. Eggleston
+2513843779: Elbert J
+2516625469: Lütjeharms WJ
+2516858404: Kerr AFG
+2513512927: Heller AA
+2514401720: Bauer
+2514946920: Unknown
+2515816966: Heldreich THH von
+2515829688: Alvarez RJ
+2514854487: Herb Hattum HJ van
+2517498400: Zenker GA
+2005528821: Georg Roth
+2005593775: Aaron J. Sharp
+2005539155: F.G. Meyer
+2005533779: Aaron J. Sharp
+1146138679: Collector(s): unknown, Eve Laeger, Carolyn L. Parker, Page Spencer, Stacy Studebaker
+1989533791: John Samples
+1989181820: C. A. Davis
+1989454858: LeRoy H. Harvey
+2236018163: H. H. Bartlett
+1317226844: P. Bartsch
+1318393575: F. W. Pennell
+1319648357: N. L. Britton, E. G. Britton & J. A. Shafer
+1676047656: C. Conzatti
+1317746297: I. L. Wiggins
+1319811635: H. A. Allard
+2426886454: B. F. Bush
+2452230713: E. Bourgeau
+1320398138: Y. Mexia
+1321991154: C. L. Pollard
+1426052966: Pennell, Francis Whittier
+2234495456: Wicker, Rassie Everton
+1257613902: Morice Vanoverbergh
+3342876306: Aven Nelson
+1804372890: Dutilly, Arthème H.
+2265509546: Congdon, J.
+2265485412: Lyon, H.
+2265588715: Rosendahl, C.
+2265566552: Biltmore Herb.
+1019752371: Coste, H.
+439633342: Gay, H.
+1927886612: E. J. Palmer
+1930277693: F. von Egger
+1929084979: T. G. Yuncker
+1931220496: A. Brown
+1929900918: J. F. A. Tonduz
+1928678486: G. H. Shull
+1930979681: R. E. Schultes
+1426166249: Wright, Charles
+2981274251: Pennell, F.W. (no. 13792)
+2981282666: Bang, M. (no. 1005)
+2236950433: J. W. Adams
+1998843424: M. L. Fernald
+1998543588: F. Blanchard
+2425435971: W. L. C. Muenscher & O. F. Curtis
+1999112557: A. Tonduz
+1998836464: E. F. Williams
+2446819762: G. W. Stevens
+1999401722: C. H. Bissell
+2575086103: Biltmore Herbarium
+1998565761: W. H. Brewer
+1999418228: C. F. Batchelder
+1999130856: F. J. Hermann
+3341239321: Unknown
+3341248414: Dana W. Fellows
+3341257544: Philip Dowell; William T. Davis
+1988903463: J. Funk
+1989179393: P.H. Eschmeyer
+1989439019: George L. Ames
+1988751949: Walter N. Koelz
+1989036412: Frederick J. Hermann
+1989326782: Charles K. Dodge
+3060557324: M. Ramos
+1091087924: W. J. Eyerdam
+1091063773: J. K. Small
+2418868230: Blake, S.T.
+2573469113: Francis Ramaley
+2573544171: Francis Ramaley; K. Richard Johnson
+2573330291: A. J. Evans
+2573328013: Francis Ramaley
+912507071: Ducke, A.
+912540960: Loscos, F.
+912565425: Deséglise, A.
+912570867: Huet du Pavillion, A.; Huet du Pavillion, E.
+1258000260: Benjamin F. Bush
+2235847388: E. J. Palmer
+1675955089: A. Eastwood & J. T. Howell
+1456376042: J. H. Sandberg, D. T. MacDougal & A. A. Heller
+1563240212: O. F. Cook & G. B. Gilbert
+2592238071: E. C. Leonard
+1322630109: E. J. Palmer
+2848440467: W. N. Suksdorf
+1503173489: E. F. Williams
+1931419278: C. G. Pringle
+2514177451: Türckheim H von
+2516011819: Meer J van der
+2514770172: Jansen P; Wachter WH; Unio
+2517266770: Tulner
+2512966787: Callier AS; Callier R
+2513883147: Lörzing JA
+2513913908: Iboet
+2514708896: Kobus JD
+1702827727: C. L. Lundell
+2625873744: A. Nelson
+1563197526: P. C. Standley
+1318345053: L. F. Ward
+1318724267: W. McAtee
+1321992876: J. N. Rose
+2397724211: K. Fiebrig
+1317726996: C. N. Forbes
+2425437928: A. S. Goodale & M. Hopkins
+1998684205: J. Wolf & J. T. Rothrock
+1999110359: J. W. Robbins
+1998611243: H. T. Brown
+1999032409: E. Brainerd
+1998322454: A. H. Moore
+1998540182: C. F. Batchelder
+3703056529: M. L. Fernald & K. M. Wiegand H. T. Darlington
+1998467065: Mrs. R. M. Austin
+1998686142: J. R. Churchill
+2446828826: A. A. L. Trécul
+1998387136: J. Murdoch, Jr.
+1998808952: R. E. Horsey
+1999236210: E. L. Ekman
+2270190585: Marcus E. Jones, A.M.
+1324416944: N. Bryhn
+1324415808: E. Ryan
+2436799917: Mueller, F.
+1144117269: Harald Lindberg
+1144117930: Knut Fægri
+1144117467: Joh. Dyring
+1144116842: T. Lillefosse
+1144116297: Jens Holmboe
+1144113124: Studentekskursjon
+1144111255: T. Lillefosse
+1144111060: T. Lillefosse
+1144111291: Joh. Dyring
+1699821443: Drège, J.F.
+1699819061: Elmer, A.D.E.
+1988899873: Isaac Holden
+1989203918: Harley H. Bartlett
+1989452156: Mary E. Wharton
+1988762889: A. Dachnowski
+1989065871: Trevor Kincaid
+1989357976: J. Macoun
+1988850469: F. Drouet
+2575039168: M. E. Jones
+1999154508: C. S. Sargent
+1999365236: J. C. Parlin
+1999217275: C. W. Sharsmith
+1998934839: E. G. Britton
+1998646322: F. J. Youngberg
+1998370401: W. H. Blanchard
+3392122454: F. Vincent
+1998927495: E. F. Williams
+3385632342: O. Nagel
+1998358368: T. S. Brandegee
+2012884607: C. C. Plitt
+1998980522: C. S. Sargent
+1978791988: Gardner, Gérard
+2516441644: Bernoulli WB
+2513632425: Perraudière HRT de la
+2513512357: Blankinship JW
+2516367754: Kreulen DJW
+2516648377: Fan CS; Li YY
+2517169218: Ducke A
+2514512541: Vanoverbergh MFJPM
+2516266693: Heller AA
+2516133079: Schlechter FRR
+2514718776: Kok Ankersmit HJ
+437963061: Licent, E.
+438953416: Schweinfurth, G.A.
+667477358: Letourneux, A.
+439392092: Glaziou, A.
+438478181: Rochebrune|de
+731352746: Bourgeau, E.
+437268052: Pierre, L.
+694333780: Charles, M.
+438057885: Vieillard, E.
+439102167: Regnell
+607734568: Balansa, B.
+438658471: Bunge, A.A. von|Bienert], [T.
+3053337231: H. and M. Dearing
+3053440922: H. and M. Dearing
+3053442572: W. H. Brewer
+2830034412: Camfield, J.H.
+1840286477: Jespersen K.
+1840055092: Laurent M.
+2985561385: Dumanskiy
+575063194: Schlechter, Friedrich Richard Rudolf
+575124657: Burse (Dr.)
+575349632: Dinsmore, John Edward
+684506968: Bonnardon, R.
+474857008: Balansa|Benjamin
+607743206: Balansa, B.
+439392892: Bové
+437895938: Claussen
+438835964: Chevalier, A.
+1843863678: O.M. Clark
+2517468164: Ducke A
+2514598419: Zenker GA
+2515398418: Sipkes C
+2516437739: Herb Oudemans
+2517885479: Valckenier Suringar J
+2516041985: Schlechter FRR
+2516118664: Buwalda P
+2517843127: Hall CJJ van
+2515897913: Moll JW
+3413669029: Bornmüller,J.F.N.
+1840413408: Greenway P.
+575194266: Meyers, Fred S. & Dinsmore, J.E.
+575233549: Winkler, Hans
+1563210251: W. N. Suksdorf
+1675861401: A. Boullu
+2426917844: E. P. Killip & A. C. Smith
+2284150251: P. A. Rydberg & A. O. Garrett
+436811558: Jolly, A.
+436830234: Lindig, A.
+462516430: Boivin
+667740521: Herzog, T.
+1936123523: W. Freiberg
+1936025558: D. Gutierrez
+1935972158: L. Aterido
+3469863708: Joseph A. Ewan
+1991424996: W. R. Maxon
+1930993286: P. A. Rydberg
+1930741763: E. L. Palmer
+1928664304: J. R. Johnston
+1931285394: J. A. Shafer
+1928825645: E. O. Wooton
+1930001164: Collector unspecified
+1931101328: H. M. Raup
+1949826956: A. F. Woods
+1931004823: Collector unknown
+1928495633: L. Constance
+1262195198: P. Train
+1424769351: J. Macoun
+2612114399: J. Steinbach
+1228454849: I. W. Clokey
+1228254314: E. F. Poeppig
+1228209830: J. S. Blanchet
+439453353: Morson
+439330078: Decary, R.
+437638258: Poilane, E.
+437538294: Chevalier, A.J.B.
+437375295: Clemens, J. & M.S.
+1269651682: Clair A. Brown; Wayne L. Lenz
+3469894549: William T. Penfound
+3469965760: P. O. Schallert
+3470019601: W. S. Connor
+3469982849: Elizabeth North
+1269720741: C. G. Pringle
+3469909240: Joseph A. Ewan
+1989305257: J. F. Collins
+1988621268: William Randolph Taylor
+1988896980: W. W. Eggleston
+1989188008: Not evident
+1989449647: F.S. Collins
+1988742851: H. H. Bartlett
+1146376618: Collector(s): Aven Nelson, Ruth A. Nelson
+1852143901: J. Cuatrecasas
+2848499425: T. H. Kearney
+1456250583: L. Ward
+2512789170: W. H. Horr
+1318334082: W. L. Abbott
+1563253027: T. Howell
+2549483800: A. Muller
+2595756978: C. F. Parker
+1990824315: C.S. Crandall
+1990825865: M.A. Chrysler
+2595747531: William H. Witte
+2236597761: E. B. Bartsam
+1990809725: Walter M. Rankin
+1228491932: G. L. Stout
+2235570434: N. L. Britton & C. F. Millspaugh
+1228315937: H. N. Patterson
+1424774269: R. Ridgway
+1228251414: E. A. Mearns
+1228306029: M. E. Peck
+1928095277: J. Reverchon
+1930650448: G. V. Nash
+1928522992: F. W. Hunnewell
+1929586920: E. P. Bicknell
+1930498220: M. E. Jones
+1929215495: G. H. French
+1928249367: C. F. Baker
+1930585001: K. K. Mackenzie
+1931007684: N. Pike
+1929709919: H. H. Rusby
+3404307303: Haradjian,M.
+2597547672: Bianor,F.
+3356432307: Rechinger,K.H.
+1230523158: Baum,H.
+1230475863: Hayne
+1230521660: Spruce,R.
+1319326174: E. A. Mearns
+2452332496: G. P. Goll, O. F. Cook & G. N. Collins
+1456402683: -. Tamandjeff
+1317840733: A. S. Hitchcock & A. Chase
+2234286160: Warner, Robert
+1638361821: Kotschy, T. (no. 159)
+1563345611: A. Eastwood
+1319596167: H. A. Allard
+2235846679: M. Barreto
+1320104750: P. C. Standley
+1990003078: Stanley A. Cain
+2451547838: C.W. Sharsmith
+3709798405: Duncan, W
+3709961181: Pyron, J; McVaugh, R
+2236156450: C. B. Wolf
+1318897823: O. D. Clark
+2236024256: E. Asplund
+2236116492: E. O. Wooton
+1839875557: Dubois L.
+1839435764: Vanden Brande P.J.J.
+1840238220: Louis J.
+1260899329: Eduard F. Poeppig
+1839619660: Collector unknown
+2243287836: Hock A.
+1839372103: Clausen P.C.D.
+1839431691: Gilbert G.C.
+1990250633: H. Garman and J. N. Rose
+438991779: Chevalier, A.J.B.
+438985152: Chevalier, A.J.B.
+437640137: Burchell, W.J.
+438313505: Drushel, J.A.
+437160969: Schlechter, F.R.R.
+437955154: Perrier de la Bâthie, H.
+2269030207: Harold St. John|F.R. Fosberg
+2848503382: G. R. Vasey
+2452293722: H. D. House
+1322253698: J. F. Rock
+1318062184: E. Sennen
+2516594183: Goethart JWC; Jongmans WJ
+2513070303: Stomps TJ
+2514386394: Fisher JL
+2516664127: Hoogenraad HR
+2515812343: Meer J van der
+2514129946: Boom BK
+2516447912: Laurellard
+2515533625: Swart JJ
+2513966458: Elmer ADE
+2514695932: Boom BK
+2515524620: Blumer JC
+2513652595: Unknown
+2516337873: Smith AC
+2516850832: Unknown
+2513051709: Sennen F
+2516684995: Unreadable
+2515661641: Dubois L
+1424540219: Pomel, A.N.
+1929883118: H. K. D. Eggert
+1931242159: E. J. Palmer
+1937515283: R. C. Friesner
+1930892685: Fr. León
+1928444218: Collector unknown
+1930777125: B. F. Saurman
+1929531338: S. Watson
+1928246346: F. W. Pennell
+1930031321: H. M. Raup
+1928816120: C. Lemos
+2884115305: C.F. Baker
+1929339638: N. B. Sanson
+1928097508: H. M. Raup
+1930341154: W. R. Maxon
+1927787608: J. N. Rose
+1928722299: L. G. Sjöstedt
+1675914558: E. Nordström
+1702837908: C. S. Fan & Y. Y. Li
+2397721138: H. O'Neill
+3005768358: Collector unknown
+2643351883: E. A. McGregor
+2452246731: A. A. Heller
+2512992091: Excursie Stomps
+2513976485: Cléonique
+2516495194: Vulpius
+2517392950: Winckel WF
+2513611146: Soest JL van
+2514509647: Clemens MS
+2516970560: Lambertye L de
+2517893930: Mogg AOD
+2515754360: Unknown
+2514847693: Herb Posthumus L
+2512991486: Stomps TJ
+437201612: Lemarié
+438825081: Glaziou, A.
+438974427: Przewalski, N.M.
+437492281: Tisserant, C.
+694707666: Prater
+2243247480: Lierneux N.
+1839457423: Luja P.E.
+1839899251: De Giorgi S.
+2651317899: Sandberg, J.H.; Leiberg, J.B.
+2807245356: Barton H. Warnock
+474761544: Sennen
+474815805: Puiggari
+437420788: Pentland, J.B.
+437378615: Segret, L.
+438211454: Wright
+438902394: Gay, C.
+437612824: Huet, A.
+437656942: Drège, J.F.
+437868891: Mandon, G.
+439310574: Grelet, L.J.
+2430261821: Theodore Payne
+1258695135: Elias Nelson
+1258765954: Cyrus G. Pringle
+3016516488: H. Hiir
+3016541682: R. Wiren
+3016568853: K. Eichwald
+3016596549: Joh. Mikutowicz
+439395736: Rugel
+1839476976: Pringle C.G.
+2243240526: Scaetta H.
+1839421989: Spruce R.
+1839459560: Homblé H.
+2430118223: Ora M. Clark
+2430412350: C. L. Hitchcock; J. S. Martin
+3005774322: A. Chase
+2284285249: E. C. Leonard & G. M. Leonard
+2512764119: E. Mouillefarine
+3042814259: Smirnov V.I.
+2514810679: Wakker JH
+2514056183: Elmer ADE
+2516750642: Ooststroom SJ van
+2513614211: Gleason Jr HA
+2517250185: Kobus JD
+2514618329: Hohenacker RF
+2513506176: Sandwith CI; Sandwith NY
+2517302912: Boldingh I
+2513520338: Potts G
+2562055936: Пихтина М.
+1322099762: E. L. Ekman
+3467359925: -. Puget
+2284359495: W. Over
+2397716929: W. R. Maxon & A. D. Harvey
+1260578109: Joseph N. Rose
+3125034214: Frank F. Gander
+3356836015: E. B. Payson & L. E. B. Payson
+1998387245: A. A. Heller
+1999237355: J. C. Parlin
+1998956483: J. L. C. M. Victorin
+3356834058: D. Potter
+1998394637: J. T. Howell
+1999253468: E. B. Harger
+3734813368: N. C. Fassett
+3111515383: C. S. Sargent
+2234232983: Wilkes Expedition
+1999262308: W. M. Canby
+1998974785: J. D. Culbertson
+1998693120: E. W. Sinnott
+1701973524: Askell Røskeland
+1702263060: Magnus Frostad
+1702235740: Johan Erikson
+1702357013: O. Hammar
+1702425560: Ove Dahl
+1702254438: O. G. Blomberg
+1701477809: Jens Holmboe|Johannes Lid
+2867548082: S. K. Selland
+1701706120: J. E. Thomle
+1702237871: Gösta Jönsson
+1702359234: Joh. Dyring
+1701896120: R. E. Fridtz
+1702311724: R. E. Fridtz
+3115500317: Otto Degener|Emilio Ordoñez
+2900455868: G. B. Sudworth
+1675796841: M. L. Grant
+1563299852: B. A. Krukoff
+2512801142: F. Clements
+3005755518: H. F. Pittier
+1455960532: F. C. Hoehne
+2612111434: E. W. Nelson
+1228212556: E. P. Sheldon
+1424774292: H. N. Patterson
+2612115591: C. H. T. Townsend
+1228276754: C. A. Davis
+1228476543: Mello Barreto
+1228576989: D. C. Peattie
+1823454475: W. J. Eyerdam
+1701729979: Mrs. R. M. Austin
+1928881910: Lemmon Herbarium
+1931047540: A. T. Beals
+1929881359: M. E. Wharton
+1928479020: J. G. Jack
+1930924863: P. E. E. Sintenis
+1929472083: L. M. Umbach
+2447314594: F. W. Hunnewell
+1930491842: F. Tweedy
+1929372145: P. E. E. Sintenis
+1928581487: A. A. Heller
+1949839092: A. A. Heller
+1930729559: T. W. Edmondson
+2597502209: Y. W. Taam
+1990707954: D. L. Topping
+1500125084: P. Wilson
+1500170486: C. G. Pringle
+2513603274: Kievits DBJ
+2514462780: Meyers FS; Dinsmore JE
+2514481577: Smith HH
+2517671288: Hansen (Lars) L
+2516176063: Gravet PJF
+2513046013: Schlechter FRR
+2517838221: Brandhorst AL
+2516233977: Sarip
+2514398234: Sintenis PEE
+2517101413: Lagerheim NG von
+2515134729: Bulnheim O
+2514464378: Bakhuizen van den Brink Sr RC
+2515744959: Venema HJ
+2514086404: Beumée JGB
+2514242786: Waterhouse JHL
+2517240779: Heldreich THH von
+2515006045: Schiffner VF
+2516807990: Sande Lacoste CM van der
+2516660906: Unknown
diff --git a/nbs/data/gt-labels/geography_gt.txt b/nbs/data/gt-labels/geography_gt.txt
new file mode 100644
index 0000000..aa88403
--- /dev/null
+++ b/nbs/data/gt-labels/geography_gt.txt
@@ -0,0 +1,1008 @@
+1697659851: Russian Federation
+2573258025: United States of America
+2597666444: France
+1931288980: United States of America
+1930241969: United States of America
+1929944910: United States of America
+1931007576: Canada
+1928514234: United States of America
+1928658806: United States of America
+1931124118: United States of America
+1929752296: United States of America
+2562899020: United States of America
+1931255575: United States of America
+1929858478: United States of America
+1937505702: United States of America
+474656434: Australia
+1265505301: Brazil
+1265483891: Brazil
+3416707560: United States of America
+3416740305: Greece
+1426171668: Paraguay
+1802583431: France
+2512855384: India
+1318293083: Dominican Republic
+3005670412: Indonesia
+1318526260: Mexico
+1802569032: Mexico
+1456345670: United States of America
+1998333126: Mexico
+1998550976: United States of America
+1998969928: United States of America
+1998473911: United States of America
+2425414867: United States of America
+1999314904: United States of America
+1998413329: United States of America
+1999026558: Mexico
+1998316723: United States of America
+1999167579: United States of America
+3356803607: United States of America
+2575053354: United States of America
+1999311542: United States of America
+1999283271: United States of America
+3459889344: Thailand
+1999006043: United States of America
+1998497875: United States of America
+1998722787: Canada
+1999143240: United States of America
+2608680770: United States of America
+1999056693: United States of America
+2608673843: United States of America
+1998571450: United States of America
+1998994775: United States of America
+1998482052: United States of America
+1999328636: United States of America
+731408891: France
+438153065: Argentina
+437056001: Australia
+437693558: France
+1212575663: France
+1212567865: France
+438120086: France
+438645471: Martinique
+438118888: Côte d’Ivoire
+437639743: Belgium
+437308856: Madagascar
+438633225: South Africa
+439286989: Madagascar
+474921033: Brazil
+667499366: France
+437356552: Viet Nam
+437659994: Cuba
+437656118: United States of America
+436990042: Brazil
+438202601: Guadeloupe
+438582009: Guadeloupe
+438299177: Madagascar
+437448502: Senegal
+3334581963: Greenland
+2625898343: United States of America
+1563245392: China
+2426921679: United States of America
+2236147388: United States of America
+1675972550: Australia
+2452262576: Chile
+1802596511: Mexico
+1563323313: Norway
+2849254057: Russian Federation
+1457812021: United States of America
+2610882325: Bhutan
+1456008930: Mexico
+2571435846: Venezuela (Bolivarian Republic of)
+1321842019: Jamaica
+2235750047: Ecuador
+2900445104: United States of America
+1456276626: United States of America
+3467360175: United States of America
+1317278320: United States of America
+2421752896: United States of America
+1805297621: Canada
+1805431168: United States of America
+1039025105: United States of America
+1038991156: United States of America
+1805292273: United States of America
+1038924603: United States of America
+1805298355: United States of America
+1805292635: United States of America
+1038926232: United States of America
+1805440196: United States of America
+1038967579: United States of America
+1038933447: United States of America
+2900436116: United States of America
+2565407235: United States of America
+2235813242: Mexico
+2625851024: United States of America
+2592277723: Panama
+1675940934: Ecuador
+1321443340: Philippines
+2643353998: United States of America
+2284193310: United States of America
+2571504032: Norway
+1563210464: Sweden
+1456213805: Brazil
+1320488541: Cuba
+2565452643: United States of America
+1702847152: Sweden
+1930574598: United States of America
+1929244776: Puerto Rico
+1928370989: United States of America
+2423994521: Panama
+2573563462: United States of America
+2284153322: Argentina
+2549491705: Canada
+2397800089: United States of America
+1456001675: United States of America
+1456143688: United States of America
+1675930631: United States of America
+1319864119: Brazil
+1563285661: India
+1322650194: Costa Rica
+1318182025: Puerto Rico
+3005750161: United States of America
+3028978025: Iran (Islamic Republic of)
+3028987457: Iraq
+3028997059: France
+3029006515: Bolivia (Plurinational State of)
+2514524961: Malaysia
+2513842767: Switzerland
+2516762837: Netherlands
+2513616378: South Africa
+2514667385: Congo, Democratic Republic of the
+2514761846: Netherlands
+2515642096: Sint Maarten (Dutch part)
+2514528877: South Africa
+2516640138: Malaysia
+2514723844: Netherlands
+2515771511: Spain
+2514102813: Viet Nam
+2516419324: Netherlands
+2517526453: unknown or invalid
+2513862848: unknown or invalid
+2516471936: Indonesia
+2517375344: unknown or invalid
+2513600930: United States of America
+2514362967: Paraguay
+2516836918: Cambodia
+1056069802: Fiji
+1638424637: Germany
+1701811884: Norway
+1702371088: Norway
+1701599986: Svalbard and Jan Mayen
+1702365209: Norway
+1702012844: Norway
+1702358167: Norway
+1702001459: Norway
+1702352653: Sweden
+1701991397: Norway
+1701470291: Norway
+1701976980: Norway
+1702438076: Norway
+2005771863: Korea, Republic of
+1998773668: United States of America
+1998991904: United States of America
+1999413720: United States of America
+1999320848: United States of America
+2446828060: Canada
+2012879889: United States of America
+1998481102: United States of America
+1999330570: United States of America
+1999047345: United States of America
+1998758107: United States of America
+1998465329: United States of America
+1999317509: United States of America
+1999028044: United States of America
+1830992236: China
+1135439455: Greece
+3052190309: Greece
+1584383967: China
+2859424505: United States of America
+2859305213: United States of America
+2859406708: United States of America
+2859014941: United States of America
+1563140142: United States of America
+2858981761: United States of America
+2859042459: United States of America
+2859205685: United States of America
+2382623542: United States of America
+2515947653: Belgium
+2515509875: unknown or invalid
+2516458084: Netherlands
+2512978024: unknown or invalid
+2513965497: unknown or invalid
+2515252166: Brazil
+2513596553: Mexico
+2514376754: Bolivia (Plurinational State of)
+2516750403: United States of America
+1055366369: Mexico
+1675890176: Norway
+2235995189: Mexico
+3092956655: United States of America
+1318027385: United States of America
+1321746477: Haiti
+1836712931: India
+1317865686: Panama
+1062492619: Rwanda
+1260164220: Brazil
+1424543510: Morocco
+1988124288: Canada
+1988239866: Canada
+439241145: Sri Lanka
+1840203964: Congo, Democratic Republic of the
+2243263149: Mali
+1839459389: Congo, Democratic Republic of the
+1840095043: Congo, Democratic Republic of the
+1840405939: Central African Republic
+1839928488: Congo, Democratic Republic of the
+3467294610: Belize
+3467354375: Sweden
+2625852862: China
+2236176339: Brazil
+2452236904: United States of America
+1702798423: Russian Federation
+1802552799: Colombia
+3001102338: Estonia
+3001176019: Ukraine
+3001081883: Estonia
+3001166637: Estonia
+3001179543: Estonia
+2284326054: Panama
+1675819460: United States of America
+1456165342: United States of America
+2549603947: China
+2235755905: Brazil
+1318797445: Antigua and Barbuda
+2515216190: Honduras
+2514533738: Indonesia
+2517047032: Tanzania, United Republic of
+2514461919: unknown or invalid
+2514515297: Indonesia
+2515925582: Congo, Democratic Republic of the
+2515196651: Philippines
+2516405868: Malaysia
+2517108376: United States of America
+2514917088: China
+2517171312: Norway
+2875995685: Armenia
+1935938475: Spain
+1935892381: Germany
+3091199136: United States of America
+2807578524: United States of America
+2807261388: United States of America
+2807456023: United States of America
+2807365108: United States of America
+2807319680: United States of America
+2807521521: Colombia
+1019531437: Cameroon
+1675777378: United States of America
+2284189808: Croatia
+2549496787: United States of America
+2397779786: United States of America
+2284257102: Cuba
+1319210580: Brazil
+3092906623: United States of America
+2514607056: unknown or invalid
+2513624958: unknown or invalid
+2515905904: South Africa
+2514332268: France
+2514380395: Indonesia
+2517280563: Ecuador
+2515241297: unknown or invalid
+2516406700: Belgium
+2516224982: Netherlands
+2515327027: Brazil
+2235965636: United States of America
+2625876902: United States of America
+2625904355: United States of America
+2235956175: United States of America
+2512820352: United States of America
+2235866543: Indonesia
+2426902656: United States of America
+2573212768: India
+1701276937: Norway
+1701335898: Norway
+1701231105: Norway
+1701285949: Norway
+1701266314: Norway
+1701236914: Norway
+1701287735: Sweden
+187209281: Sweden
+1701766050: Svalbard and Jan Mayen
+1937521898: United States of America
+1949855671: United States of America
+1928175542: Puerto Rico
+1929143537: United States of America
+1930252111: Puerto Rico
+1989131227: United States of America
+1989431045: United States of America
+1988737850: United States of America
+1989039350: United States of America
+1989313099: United States of America
+2274177724: United States of America
+1988969066: United States of America
+2252181535: United States of America
+2252151414: United States of America
+1977964375: Canada
+1057464806: Uganda
+1057260090: Algeria
+813379916: Angola
+1057532849: United States of America
+1056306307: Madagascar
+1057234135: Indonesia
+1057553687: Brazil
+436703955: France
+2549492260: United States of America
+1318212360: Colombia
+2848506530: China
+1321575246: Guatemala
+3005727173: United States of America
+1321995468: China
+1322817564: United States of America
+1675999637: United States of America
+1259191398: Panama
+1257819684: Russian Federation
+2430551392: United States of America
+2989927287: United States of America
+1455174725: Canada
+2284154890: United States of America
+2900461439: United States of America
+2236031445: United States of America
+1843567618: United States of America
+1675973762: United States of America
+1676047230: Germany
+1318477305: United States of America
+3698756098: United States of America
+2517167632: unknown or invalid
+2513883304: Indonesia
+2516696767: unknown or invalid
+2513656631: unknown or invalid
+2513761556: Indonesia
+2514561032: Indonesia
+2517198083: United States of America
+2516271128: Brazil
+2516743780: France
+2805014861: Germany
+144854485: Germany
+144838164: Austria
+864909330: Cuba
+144902385: Germany
+144838161: Austria
+1987004085: United States of America
+1986885681: United States of America
+2512804326: United States of America
+1322958063: Haiti
+2512759946: United States of America
+3696559994: United States of America
+1056014429: Thailand
+436716407: France
+1563221199: Tajikistan
+2512761763: Canada
+1321333954: United States of America
+1931060382: United States of America
+1929796452: United States of America
+1928452768: United States of America
+1930778822: Cuba
+1929535381: United States of America
+2565969320: Italy
+3392108313: Italy
+2562090317: Italy
+1852124166: Panama
+2236057326: Mexico
+1702754520: United States of America
+3043554903: India
+1322398916: United States of America
+1318373170: Haiti
+3357284466: United States of America
+2236142683: Costa Rica
+1701792313: Norway
+1701852084: Norway
+1702340583: Norway
+1702324083: Norway
+1702233404: Norway
+900324877: Norway
+1701892850: Sweden
+1701783462: Norway
+1701688454: Norway
+1702326021: Norway
+1702040261: Portugal
+1702265667: Norway
+2867598821: Norway
+1927910520: United States of America
+1929163462: Puerto Rico
+1930449245: Haiti
+1928350945: United States of America
+1929317363: United States of America
+2251702912: Mexico
+1930924401: United States of America
+2235396749: Malaysia
+1929252363: United States of America
+1927995752: United States of America
+438294181: France
+438824413: Algeria
+439294172: Madagascar
+667355588: Uruguay
+474752652: New Caledonia
+437804966: Italy
+437667776: Viet Nam
+437016988: Brazil
+437498688: Peru
+437205236: France
+438972235: Martinique
+1936123239: Germany
+1935990933: Palestine, State of
+1320460457: Haiti
+1702851818: Costa Rica
+1675788663: United States of America
+1456419962: United States of America
+3340047855: United States of America
+1927799942: Mexico
+1930203372: United States of America
+1929034867: Brazil
+2012771368: Costa Rica
+1930177611: Malaysia
+1928904177: United States of America
+1928276301: Colombia
+1929455203: Brazil
+1930554353: United States of America
+1928034398: United States of America
+1929153673: Jamaica
+1930323339: United States of America
+2513816674: unknown or invalid
+2514046501: Malaysia
+2514666766: Netherlands
+2516864891: Netherlands
+2513843663: Brazil
+2514563201: Spain
+2513556994: Argentina
+2517073037: Suriname
+2513548374: Paraguay
+2513105864: Brazil
+2514597432: Indonesia
+2513686671: Congo, Democratic Republic of the
+2513064820: Hungary
+2516564502: United Kingdom of Great Britain and Northern Ireland
+2513517294: United States of America
+2515481887: Papua New Guinea
+2513028120: unknown or invalid
+2516037063: Algeria
+2514219426: unknown or invalid
+3020071550: Estonia
+3020078501: Estonia
+3019979401: Estonia
+1930839217: Brazil
+1991428478: Mexico
+1928506645: United States of America
+1930481189: United States of America
+1981501278: Mexico
+1928908236: Guyana
+3503204182: United States of America
+1998729261: United States of America
+1998968996: United States of America
+2425445076: United States of America
+1998882868: United States of America
+1999105588: United States of America
+1998590947: United States of America
+1998818359: United States of America
+1998308840: United States of America
+1998543749: United States of America
+1998975109: United States of America
+1999410499: United States of America
+2425404585: Canada
+1999314245: United States of America
+2573155691: Australia
+2573161023: Russian Federation
+2572960903: United States of America
+2573054178: United States of America
+2513843779: Indonesia
+2516625469: Indonesia
+2516858404: Thailand
+2513512927: United States of America
+2514401720: unknown or invalid
+2514946920: Netherlands
+2515816966: Greece
+2515829688: Philippines
+2514854487: Netherlands
+2517498400: Cameroon
+2005528821: Germany
+2005593775: United States of America
+2005539155: United States of America
+2005533779: United States of America
+1146138679: United States of America
+1989533791: United States of America
+1989181820: United States of America
+1989454858: Mexico
+2236018163: Indonesia
+1317226844: Philippines
+1318393575: Colombia
+1319648357: Virgin Islands (U.S.)
+1676047656: Mexico
+1317746297: Mexico
+1319811635: United States of America
+2426886454: United States of America
+2452230713: Türkiye
+1320398138: Ecuador
+1321991154: United States of America
+1426052966: United States of America
+2234495456: United States of America
+1257613902: Philippines
+3342876306: United States of America
+1804372890: Canada
+2265509546: United States of America
+2265485412: United States of America
+2265588715: United States of America
+2265566552: United States of America
+1019752371: France
+439633342: France
+1927886612: United States of America
+1930277693: Cuba
+1929084979: Honduras
+1931220496: United States of America
+1929900918: Costa Rica
+1928678486: United States of America
+1930979681: Mexico
+1426166249: United States of America
+2981274251: Peru
+2981282666: Bolivia (Plurinational State of)
+2236950433: United States of America
+1998843424: United States of America
+1998543588: United States of America
+2425435971: United States of America
+1999112557: Costa Rica
+1998836464: United States of America
+2446819762: United States of America
+1999401722: United States of America
+2575086103: United States of America
+1998565761: United States of America
+1999418228: United States of America
+1999130856: United States of America
+3341239321: United States of America
+3341248414: United States of America
+3341257544: United States of America
+1988903463: United States of America
+1989179393: United States of America
+1989439019: United States of America
+1988751949: India
+1989036412: United States of America
+1989326782: United States of America
+3060557324: Philippines
+1091087924: United States of America
+1091063773: United States of America
+2418868230: Australia
+2573469113: United States of America
+2573544171: United States of America
+2573330291: United States of America
+2573328013: United States of America
+912507071: Brazil
+912540960: Spain
+912565425: France
+912570867: Italy
+1258000260: United States of America
+2235847388: United States of America
+1675955089: United States of America
+1456376042: United States of America
+1563240212: Peru
+2592238071: United States of America
+1322630109: United States of America
+2848440467: United States of America
+1503173489: United States of America
+1931419278: United States of America
+2514177451: Guatemala
+2516011819: unknown or invalid
+2514770172: Netherlands
+2517266770: Netherlands
+2512966787: Poland
+2513883147: Indonesia
+2513913908: Indonesia
+2514708896: Netherlands
+1702827727: Mexico
+2625873744: United States of America
+1563197526: United States of America
+1318345053: United States of America
+1318724267: United States of America
+1321992876: Mexico
+2397724211: Bolivia (Plurinational State of)
+1317726996: United States of America
+2425437928: United States of America
+1998684205: United States of America
+1999110359: United States of America
+1998611243: United States of America
+1999032409: United States of America
+1998322454: United States of America
+1998540182: United States of America
+3703056529: Canada
+1998467065: United States of America
+1998686142: United States of America
+2446828826: United States of America
+1998387136: United States of America
+1998808952: United States of America
+1999236210: Haiti
+2270190585: United States of America
+1324416944: Norway
+1324415808: Norway
+2436799917: Australia
+1144117269: Finland
+1144117930: Norway
+1144117467: Norway
+1144116842: Norway
+1144116297: Norway
+1144113124: Norway
+1144111255: Norway
+1144111060: Norway
+1144111291: Norway
+1699821443: South Africa
+1699819061: Philippines
+1988899873: United States of America
+1989203918: Indonesia
+1989452156: United States of America
+1988762889: United States of America
+1989065871: United States of America
+1989357976: Canada
+1988850469: United States of America
+2575039168: United States of America
+1999154508: United States of America
+1999365236: United States of America
+1999217275: United States of America
+1998934839: United States of America
+1998646322: United States of America
+1998370401: United States of America
+3392122454: Canada
+1998927495: United States of America
+3385632342: Mexico
+1998358368: Mexico
+2012884607: United States of America
+1998980522: United States of America
+1978791988: Canada
+2516441644: unknown or invalid
+2513632425: Algeria
+2513512357: United States of America
+2516367754: Indonesia
+2516648377: China
+2517169218: unknown or invalid
+2514512541: Philippines
+2516266693: United States of America
+2516133079: South Africa
+2514718776: Netherlands
+437963061: China
+438953416: Sudan
+667477358: Tunisia
+439392092: Brazil
+438478181: France
+731352746: Türkiye
+437268052: Cambodia
+694333780: Mauritania
+438057885: New Caledonia
+439102167: Brazil
+607734568: Viet Nam
+438658471: Iran (Islamic Republic of)
+3053337231: United States of America
+3053440922: United States of America
+3053442572: United States of America
+2830034412: Australia
+1840286477: Congo, Democratic Republic of the
+1840055092: Congo, Democratic Republic of the
+2985561385: Russian Federation
+575063194: South Africa
+575124657: Iran (Islamic Republic of)
+575349632: Israel
+684506968: France
+474857008: Paraguay
+607743206: New Caledonia
+439392892: Algeria
+437895938: Brazil
+438835964: Côte d’Ivoire
+1843863678: United States of America
+2517468164: Brazil
+2514598419: Cameroon
+2515398418: unknown or invalid
+2516437739: unknown or invalid
+2517885479: unknown or invalid
+2516041985: South Africa
+2516118664: Indonesia
+2517843127: Suriname
+2515897913: unknown or invalid
+3413669029: Türkiye
+1840413408: Tanzania, United Republic of
+575194266: Israel
+575233549: Indonesia
+1563210251: United States of America
+1675861401: France
+2426917844: Colombia
+2284150251: United States of America
+436811558: Côte d’Ivoire
+436830234: Colombia
+462516430: Madagascar
+667740521: Bolivia (Plurinational State of)
+1936123523: Russian Federation
+1936025558: Spain
+1935972158: Spain
+3469863708: United States of America
+1991424996: Costa Rica
+1930993286: United States of America
+1930741763: United States of America
+1928664304: Puerto Rico
+1931285394: Cuba
+1928825645: United States of America
+1930001164: United States of America
+1931101328: Canada
+1949826956: United States of America
+1931004823: United States of America
+1928495633: United States of America
+1262195198: United States of America
+1424769351: Canada
+2612114399: Bolivia (Plurinational State of)
+1228454849: United States of America
+1228254314: Peru
+1228209830: Brazil
+439453353: Sierra Leone
+439330078: Madagascar
+437638258: Viet Nam
+437538294: Côte d’Ivoire
+437375295: Viet Nam
+1269651682: United States of America
+3469894549: United States of America
+3469965760: United States of America
+3470019601: United States of America
+3469982849: United States of America
+1269720741: Mexico
+3469909240: United States of America
+1989305257: United States of America
+1988621268: United States of America
+1988896980: United States of America
+1989188008: Somalia
+1989449647: United States of America
+1988742851: United States of America
+1146376618: United States of America
+1852143901: Colombia
+2848499425: United States of America
+1456250583: United States of America
+2512789170: United States of America
+1318334082: Dominican Republic
+1563253027: United States of America
+2549483800: Canada
+2595756978: United States of America
+1990824315: United States of America
+1990825865: United States of America
+2595747531: United States of America
+2236597761: United States of America
+1990809725: United States of America
+1228491932: United States of America
+2235570434: Bahamas
+1228315937: United States of America
+1424774269: United States of America
+1228251414: United States of America
+1228306029: United States of America
+1928095277: United States of America
+1930650448: United States of America
+1928522992: United States of America
+1929586920: United States of America
+1930498220: United States of America
+1929215495: United States of America
+1928249367: United States of America
+1930585001: United States of America
+1931007684: United States of America
+1929709919: Bolivia (Plurinational State of)
+3404307303: Türkiye
+2597547672: Spain
+3356432307: Iran (Islamic Republic of)
+1230523158: Angola
+1230475863: Austria
+1230521660: Peru
+1319326174: United States of America
+2452332496: Puerto Rico
+1456402683: Bulgaria
+1317840733: United States of America
+2234286160: United States of America
+1638361821: Sudan
+1563345611: United States of America
+1319596167: United States of America
+2235846679: Brazil
+1320104750: Panama
+1990003078: United States of America
+2451547838: United States of America
+3709798405: United States of America
+3709961181: United States of America
+2236156450: United States of America
+1318897823: United States of America
+2236024256: Ecuador
+2236116492: United States of America
+1839875557: Congo, Democratic Republic of the
+1839435764: Congo, Democratic Republic of the
+1840238220: Congo, Democratic Republic of the
+1260899329: Peru
+1839619660: Belgium
+2243287836: Congo, Democratic Republic of the
+1839372103: Brazil
+1839431691: Congo, Democratic Republic of the
+1990250633: United States of America
+438991779: Mali
+438985152: Côte d’Ivoire
+437640137: Brazil
+438313505: United States of America
+437160969: New Caledonia
+437955154: Madagascar
+2269030207: French Polynesia
+2848503382: United States of America
+2452293722: United States of America
+1322253698: United States of America
+1318062184: France
+2516594183: Switzerland
+2513070303: United States of America
+2514386394: United States of America
+2516664127: Netherlands
+2515812343: unknown or invalid
+2514129946: Spain
+2516447912: Netherlands
+2515533625: Netherlands
+2513966458: Philippines
+2514695932: unknown or invalid
+2515524620: United States of America
+2513652595: unknown or invalid
+2516337873: Fiji
+2516850832: unknown or invalid
+2513051709: France
+2516684995: Netherlands
+2515661641: Congo, Democratic Republic of the
+1424540219: Tunisia
+1929883118: United States of America
+1931242159: United States of America
+1937515283: United States of America
+1930892685: Cuba
+1928444218: United States of America
+1930777125: United States of America
+1929531338: United States of America
+1928246346: Peru
+1930031321: United States of America
+1928816120: Brazil
+2884115305: United States of America
+1929339638: Canada
+1928097508: Canada
+1930341154: Cuba
+1927787608: Antigua and Barbuda
+1928722299: Spain
+1675914558: Sweden
+1702837908: China
+2397721138: United States of America
+3005768358: United States of America
+2643351883: United States of America
+2452246731: United States of America
+2512992091: unknown or invalid
+2513976485: Canada
+2516495194: unknown or invalid
+2517392950: Indonesia
+2513611146: Switzerland
+2514509647: Papua New Guinea
+2516970560: unknown or invalid
+2517893930: unknown or invalid
+2515754360: unknown or invalid
+2514847693: Netherlands
+2512991486: Netherlands
+437201612: Viet Nam
+438825081: Brazil
+438974427: China
+437492281: Central African Republic
+694707666: Brazil
+2243247480: Belgium
+1839457423: Congo, Democratic Republic of the
+1839899251: Congo, Democratic Republic of the
+2651317899: United States of America
+2807245356: United States of America
+474761544: France
+474815805: Brazil
+437420788: Bolivia (Plurinational State of)
+437378615: Algeria
+438211454: Cuba
+438902394: Chile
+437612824: Italy
+437656942: South Africa
+437868891: Bolivia (Plurinational State of)
+439310574: France
+2430261821: United States of America
+1258695135: United States of America
+1258765954: Mexico
+3016516488: Estonia
+3016541682: Estonia
+3016568853: Estonia
+3016596549: Latvia
+439395736: United States of America
+1839476976: Mexico
+2243240526: Congo, Democratic Republic of the
+1839421989: Peru
+1839459560: Congo, Democratic Republic of the
+2430118223: United States of America
+2430412350: United States of America
+3005774322: United States of America
+2284285249: Haiti
+2512764119: Spain
+3042814259: Russian Federation
+2514810679: Netherlands
+2514056183: Malaysia
+2516750642: Netherlands
+2513614211: United States of America
+2517250185: Indonesia
+2514618329: unknown or invalid
+2513506176: unknown or invalid
+2517302912: Bonaire, Sint Eustatius and Saba
+2513520338: unknown or invalid
+2562055936: Russian Federation
+1322099762: Haiti
+3467359925: France
+2284359495: United States of America
+2397716929: Costa Rica
+1260578109: Mexico
+3125034214: United States of America
+3356836015: United States of America
+1998387245: United States of America
+1999237355: United States of America
+1998956483: Canada
+3356834058: Canada
+1998394637: United States of America
+1999253468: United States of America
+3734813368: United States of America
+3111515383: United States of America
+2234232983: Samoa
+1999262308: United States of America
+1998974785: United States of America
+1998693120: United States of America
+1701973524: Norway
+1702263060: Norway
+1702235740: Sweden
+1702357013: Sweden
+1702425560: Norway
+1702254438: Sweden
+1701477809: Norway
+2867548082: Norway
+1701706120: Norway
+1702237871: Sweden
+1702359234: Norway
+1701896120: Norway
+1702311724: Norway
+3115500317: Fiji
+2900455868: United States of America
+1675796841: French Polynesia
+1563299852: Brazil
+2512801142: United States of America
+3005755518: Guatemala
+1455960532: Brazil
+2612111434: Mexico
+1228212556: United States of America
+1424774292: United States of America
+2612115591: Peru
+1228276754: United States of America
+1228476543: Brazil
+1228576989: United States of America
+1823454475: Bolivia (Plurinational State of)
+1701729979: United States of America
+1928881910: United States of America
+1931047540: United States of America
+1929881359: United States of America
+1928479020: Cuba
+1930924863: Puerto Rico
+1929472083: United States of America
+2447314594: United States of America
+1930491842: United States of America
+1929372145: Puerto Rico
+1928581487: United States of America
+1949839092: United States of America
+1930729559: United States of America
+2597502209: Hong Kong
+1990707954: United States of America
+1500125084: Honduras
+1500170486: United States of America
+2513603274: Indonesia
+2514462780: Israel
+2514481577: Colombia
+2517671288: unknown or invalid
+2516176063: unknown or invalid
+2513046013: Cameroon
+2517838221: Switzerland
+2516233977: Indonesia
+2514398234: Croatia
+2517101413: Sweden
+2515134729: Germany
+2514464378: Indonesia
+2515744959: France
+2514086404: Indonesia
+2514242786: Solomon Islands
+2517240779: Türkiye
+2515006045: Indonesia
+2516807990: Netherlands
+2516660906: Netherlands
diff --git a/nbs/data/gt-labels/taxon_gt.txt b/nbs/data/gt-labels/taxon_gt.txt
new file mode 100644
index 0000000..a94a64f
--- /dev/null
+++ b/nbs/data/gt-labels/taxon_gt.txt
@@ -0,0 +1,1008 @@
+1697659851: Euphrasia officinalis
+2573258025: Bryoerythrophyllum recurvirostrum
+2597666444: Carduus tenuiflorus
+1931288980: Agoseris parviflora
+1930241969: Spiraea canescens
+1929944910: Chylismia scapoidea
+1931007576: Carex typhina
+1928514234: Stachys hispida
+1928658806: Solanum donianum
+1931124118: Suaeda nigra
+1929752296: Dryopteris intermedia
+2562899020: Spiranthes ochroleuca
+1931255575: Rosa woodsii
+1929858478: Carex folliculata
+1937505702: Pteridium aquilinum
+474656434: Halgania cyanea
+1265505301: Neea floribunda
+1265483891: Mendoncia coccinea
+3416707560: Platygyrium repens
+3416740305: Pseudoleskea saviana
+1426171668: Albizia niopoides
+1802583431: Jasione humilis
+2512855384: Clematis ladakhiana
+1318293083: Cordia sulcata
+3005670412: Oplismenus burmanni
+1318526260: Tripsacum maizar
+1802569032: Lobelia flexuosa
+1456345670: Solidago stricta
+1998333126: Philadelphus coulteri
+1998550976: Apocynum floribundum
+1998969928: Spiraea salicifolia
+1998473911: Persicaria sagittata
+2425414867: Goodyera oblongifolia
+1999314904: Asplenium trichomanes
+1998413329: Poa palustris
+1999026558: Inga flexuosa
+1998316723: Porella platyphylla
+1999167579: Pseudoziziphus parryi
+3356803607: Vicia americana
+2575053354: Sambucus racemosa
+1999311542: Arisaema triphyllum
+1999283271: Panicum clandestinum
+3459889344: Coelogyne xyrekes
+1999006043: Carex stricta
+1998497875: Rhus ovata
+1998722787: Paraleucobryum longifolium
+1999143240: Antennaria neglecta
+2608680770: Liatris spicata
+1999056693: Actaea rubra
+2608673843: Viburnum nudum
+1998571450: Viola primulifolia
+1998994775: Stellaria media
+1998482052: Silene dioica
+1999328636: Dactylorhiza viridis
+731408891: Quercus suber
+438153065: Handroanthus impetiginosum
+437056001: Senna multiglandulosa
+437693558: Brimeura amethystina
+1212575663: Rumex crispus
+1212567865: Tigridia pavonia
+438120086: Datura stramonium
+438645471: Viola stipularis
+438118888: Adenia cissampeloides
+437639743: Hesperocodon hederaceus
+437308856: Pauridiantha paucinervis
+438633225: Senecio puberulus
+439286989: Elaeis guineensis
+474921033: Besleria melancholica
+667499366: Silene nutans
+437356552: Dehaasia cuneata
+437659994: Passiflora sexflora
+437656118: Betula alleghaniensis
+436990042: Stelis ruprechtiana
+438202601: Isochilus linearis
+438582009: Solanum bahamense
+438299177: Justicia mediocris
+437448502: Sphaeranthus senegalensis
+3334581963: Poa arctica
+2625898343: Xerophyllum asphodeloides
+1563245392: Artemisia lactiflora
+2426921679: Arceuthobium americanum
+2236147388: Arctostaphylos crustacea
+1675972550: Cyperus scariosus
+2452262576: Stellaria cuspidata
+1802596511: Apodanthera undulata
+1563323313: Pilosella hyperborea
+2849254057: Anemone sylvestris
+1457812021: Ilex opaca
+2610882325: Sigesbeckia orientalis
+1456008930: Stevia serrata
+2571435846: Acaena cylindristachya
+1321842019: Marattia alata
+2235750047: Miconia calvescens
+2900445104: Agrostis scabra
+1456276626: Solidago altissima
+3467360175: Rhizomnium punctatum
+1317278320: Sidalcea asprella
+2421752896: Solanum mauritianum
+1805297621: Tritomaria exsectiformis
+1805431168: Scorpidium revolvens
+1039025105: Chenopodium hybridum
+1038991156: Eriogonum brachypodum
+1805292273: Thysananthus auriculatus
+1038924603: Eriophorum tenellum
+1805298355: Frullania eboracensis
+1805292635: Haplohymenium triste
+1038926232: Solidago patula
+1805440196: Sphagnum subnitens
+1038967579: Suaeda linearis
+1038933447: Nuttallanthus canadensis
+2900436116: Agrostis stolonifera
+2565407235: Crataegus mollis
+2235813242: Physalis glutinosa
+2625851024: Potamogeton gramineus
+2592277723: Nectandra lineata
+1675940934: Polystichum lehmannii
+1321443340: Pteris semipinnata
+2643353998: Agalinis tenella
+2284193310: Polygala verticillata
+2571504032: Fragaria moschata
+1563210464: Hieracium bifidum
+1456213805: Stenocephalum tragiaefolium
+1320488541: Serjania subdentata
+2565452643: Crataegus brainerdii
+1702847152: Carex capillaris
+1930574598: Arctostaphylos uva-ursi
+1929244776: Malvastrum corchorifolium
+1928370989: Cirsium muticum
+2423994521: Stigmatopteris longicaudata
+2573563462: Ceramium diaphanum
+2284153322: Lippia brachypoda
+2549491705: Parnassia palustris
+2397800089: Quercus rugosa
+1456001675: Bidens vulgata
+1456143688: Bidens heterosperma
+1675930631: Carex albonigra
+1319864119: Panicum itatiaiae
+1563285661: Artemisia macrocephala
+1322650194: Ctenitis nigrovenia
+1318182025: Hohenbergia antillana
+3005750161: Aristida spiciformis
+3028978025: Scrophularia subaphylla
+3028987457: Silene physocalycina
+3028997059: Hieracium murorum
+3029006515: Anastrophyllum auritum
+2514524961: Gaertnera rufinervis
+2513842767: Salix aurita
+2516762837: Fritillaria meleagris
+2513616378: Helichrysum gymnocomum
+2514667385: Peponium vogelii
+2514761846: Ammophila arenaria
+2515642096: Eumachia microdon
+2514528877: Vitex obovata
+2516640138: Ixora grandifolia
+2514723844: Oenothera biennis
+2515771511: Trifolium dubium
+2514102813: Crateva magna
+2516419324: Dactylorhiza majalis
+2517526453: Prunus tomentosa
+2513862848: Quercus benderi
+2516471936: Uncaria cordata
+2517375344: Silene latifolia
+2513600930: Panicum acuminatum
+2514362967: Eryngium balansae
+2516836918: Dendrolobium rugosum
+1056069802: Calophyllum vitiense
+1638424637: Rubus amygdalanthus
+1701811884: Carex atrata
+1702371088: Asplenium marinum
+1701599986: Ranunculus nivalis
+1702365209: Woodsia ilvensis
+1702012844: Struthiopteris spicant
+1702358167: Aquilegia vulgaris
+1702001459: Mentha arvensis
+1702352653: Helichrysum luteoalbum
+1701991397: Polygonatum verticillatum
+1701470291: Luzula multiflora
+1701976980: Linaria vulgaris
+1702438076: Sparganium emersum
+2005771863: Distylium racemosum
+1998773668: Phegopteris hexagonoptera
+1998991904: Brachythecium salebrosum
+1999413720: Potamogeton diversifolius
+1999320848: Corylus cornuta
+2446828060: Bartonia paniculata
+2012879889: Fraxinus profunda
+1998481102: Atriplex prostrata
+1999330570: Carex pellita
+1999047345: Digitaria sanguinalis
+1998758107: Cirsium vulgare
+1998465329: Actaea rubra
+1999317509: Boechera stricta
+1999028044: Clarkia xantiana
+1830992236: Neodolichomitra yunnanensis
+1135439455: Centaurea tuntasia
+3052190309: Scrophularia heterophylla
+1584383967: Berberis poiretii
+2859424505: Oenothera biennis
+2859305213: Solidago patula
+2859406708: Euphorbia nutans
+2859014941: Cladium mariscoides
+1563140142: Cheilolejeunea clausa
+2858981761: Panicum acuminatum
+2859042459: Eutrochium fistulosum
+2859205685: Solanum carolinense
+2382623542: Isocoma menziesii
+2515947653: Equisetum sylvaticum
+2515509875: Trifolium campestre
+2516458084: Sedum acre
+2512978024: Actaea europaea
+2513965497: Chenopodium quinoa
+2515252166: Cymbocarpa refracta
+2513596553: Muhlenbergia tenuifolia
+2514376754: Eryngium ebracteatum
+2516750403: Lipochaeta succulenta
+1055366369: Bidens cornuta
+1675890176: Carex subspathacea
+2235995189: Rauvolfia tetraphylla
+3092956655: Chloris virgata
+1318027385: Cuscuta gronovii
+1321746477: Miconia coniophora
+1836712931: Viburnum colebrookeanum
+1317865686: Pteris altissima
+1062492619: Asplenium megalura
+1260164220: Odontadenia semidigyna
+1424543510: Lactuca tenerrima
+1988124288: Gaultheria humifusa
+1988239866: Juniperus horizontalis
+439241145: Eleocharis tetraquetra
+1840203964: Neonotonia wightii
+2243263149: Aristida adscensionis
+1839459389: Gaertnera longivaginalis
+1840095043: Dovyalis macrocalyx
+1840405939: Zanthoxylum leprieurii
+1839928488: Newbouldia laevis
+3467294610: Syrrhopodon incompletus
+3467354375: Rhizomnium punctatum
+2625852862: Cistanche salsa
+2236176339: Rhabdodendron macrophyllum
+2452236904: Silene douglasii
+1702798423: Carex bigelowii
+1802552799: Nertera granadensis
+3001102338: Herminium monorchis
+3001176019: Agrimonia eupatoria
+3001081883: Pilosella officinarum
+3001166637: Lithospermum officinale
+3001179543: Ranunculus cassubicus
+2284326054: Mangifera indica
+1675819460: Carex normalis
+1456165342: Coreopsis lanceolata
+2549603947: Itea chinensis
+2235755905: Abuta grandifolia
+1318797445: Drymaria cordata
+2515216190: Philodendron sagittifolium
+2514533738: Cyrtandra picta
+2517047032: Plectranthus djalonensis
+2514461919: Ligusticum ferulaceum
+2514515297: Symplocos robinsonii
+2515925582: Drypetes paxii
+2515196651: Goniothalamus elmeri
+2516405868: Calophyllum andersonii
+2517108376: Penicillus lamourouxii
+2514917088: Cyrtomium fortunei
+2517171312: Cystoclonium purpureum
+2875995685: Dianthus cretaceus
+1935938475: Cistus crispus
+1935892381: Amaranthus viridis
+3091199136: Hypericum scouleri
+2807578524: Desmanthus illinoensis
+2807261388: Ipomopsis longiflora
+2807456023: Salvia reptans
+2807365108: Bothriochloa bladhii
+2807319680: Muhlenbergia paniculata
+2807521521: Hymenophyllum elegans
+1019531437: Isopterygium nivescens
+1675777378: Carex athrostachya
+2284189808: Asemeia ovata
+2549496787: Philadelphus microphyllus
+2397779786: Quercus stellata
+2284257102: Elaphoglossum erinaceum
+1319210580: Phyllanthus acidus
+3092906623: Sporobolus clandestinus
+2514607056: Verbascum nigrum
+2513624958: Hieracium cymosum
+2515905904: Schizachyrium jeffreysii
+2514332268: Thymelaea dioica
+2514380395: Shorea ovalis
+2517280563: Asplenium monanthes
+2515241297: Cerastium biebersteinii
+2516406700: Globularia bisnagarica
+2516224982: Plantago arenaria
+2515327027: Asplenium serratum
+2235965636: Pyrola asarifolia
+2625876902: Yucca baccata
+2625904355: Juncus rugulosus
+2235956175: Vaccinium erythrocarpum
+2512820352: Boechera perennans
+2235866543: Labisia pumila
+2426902656: Boerhavia torreyana
+2573212768: Walsura robusta
+1701276937: Nepeta laevigata
+1701335898: Bunias orientalis
+1701231105: Hordeum jubatum
+1701285949: Silene flos-cuculi
+1701266314: Hieracium cruentifolium
+1701236914: Thalictrum simplex
+1701287735: Vicia dumetorum
+187209281: Ophioglossum vulgatum
+1701766050: Ranunculus nivalis
+1937521898: Hypericum hypericoides
+1949855671: Drosera intermedia
+1928175542: Citharexylum spinosum
+1929143537: Spiraea tomentosa
+1930252111: Rhynchospora holoschoenoides
+1989131227: Chamaenerion angustifolium
+1989431045: Cirsium pumilum
+1988737850: Eriophorum gracile
+1989039350: Syrrhopodon incompletus
+1989313099: Scirpus atrovirens
+2274177724: Pseudolycopodiella caroliniana
+1988969066: Iris virginica
+2252181535: Pterygoneurum subsessile
+2252151414: Encalypta rhaptocarpa
+1977964375: Polystichum lonchitis
+1057464806: Psydrax schimperiana
+1057260090: Fumaria capreolata
+813379916: Wahlenbergia verbascoides
+1057532849: Cladophora columbiana
+1056306307: Angraecum rostratum
+1057234135: Calymmodon clavifer
+1057553687: Cuspidaria simplicifolia
+436703955: Logfia minima
+2549492260: Saxifraga bronchialis
+1318212360: Cyathea squamipes
+2848506530: Stipa przewalskyi
+1321575246: Pinus pseudostrobus
+3005727173: Panicum aciculare
+1321995468: Crepidomanes minutum
+1322817564: Asplenium macraei
+1675999637: Astragalus sparsiflorus
+1259191398: Appunia seibertii
+1257819684: Salix wilhelmsiana
+2430551392: Salvia farinacea
+2989927287: Ladeania lanceolata
+1455174725: Polystichum acrostichoides
+2284154890: Nemophila pulchella
+2900461439: Poa secunda
+2236031445: Elliottia pyroliflora
+1843567618: Solanum jamesii
+1675973762: Lespedeza virginica
+1676047230: Carex muricata
+1318477305: Epilobium ciliatum
+3698756098: Ilex montana
+2517167632: Ajuga genevensis
+2513883304: Artocarpus rigidus
+2516696767: Carduus pycnocephalus
+2513656631: Carex buekii
+2513761556: Molineria latifolia
+2514561032: Adenosma hirsutum
+2517198083: Mirabilis multiflora
+2516271128: Tassadia berteriana
+2516743780: Carduus carlinoides
+2805014861: Kalanchoe densiflora
+144854485: Dactylorhiza sambucina
+144838164: Artemisia umbelliformis
+864909330: Selaginella tenella
+144902385: Vaccinium intermedium
+144838161: Artemisia umbelliformis
+1987004085: Sisyrinchium idahoense
+1986885681: Poa secunda
+2512804326: Ranunculus fascicularis
+1322958063: Varronia linnaei
+2512759946: Clematis ligusticifolia
+3696559994: Bryum capillare
+1056014429: Mangifera duperreana
+436716407: Thesium alpinum
+1563221199: Sophora alopecuroides
+2512761763: Clematis occidentalis
+1321333954: Lupinus truncatus
+1931060382: Rubus allegheniensis
+1929796452: Cladophora flexuosa
+1928452768: Carex cephalophora
+1930778822: Comocladia platyphylla
+1929535381: Oxytropis campestris
+2565969320: Anemone apennina
+3392108313: Adenocarpus commutatus
+2562090317: Nigella damascena
+1852124166: Solanum splendens
+2236057326: Donnellsmithia juncea
+1702754520: Kelloggia galioides
+3043554903: Hemarthria compressa
+1322398916: Dalea floridana
+1318373170: Miconia elaeagnoides
+3357284466: Muhlenbergia richardsonis
+2236142683: Miconia silviphila
+1701792313: Linaria vulgaris
+1701852084: Epilobium palustre
+1702340583: Carex bigelowii
+1702324083: Mercurialis annua
+1702233404: Cerastium arcticum
+900324877: Jasione montana
+1701892850: Anthericum liliago
+1701783462: Polygonatum odoratum
+1701688454: Juncus triglumis
+1702326021: Dryas octopetala
+1702040261: Polystichum falcinellum
+1702265667: Impatiens noli-tangere
+2867598821: Taraxacum croceum
+1927910520: Dalea jamesii
+1929163462: Microgramma lycopodioides
+1930449245: Trichilia pallida
+1928350945: Phaseolus maculatus
+1929317363: Juncus tenuis
+2251702912: Athyrium arcuatum
+1930924401: Sporobolus alterniflorus
+2235396749: Alsophila loheri
+1929252363: Carex lupulina
+1927995752: Sceptridium dissectum
+438294181: Campanula barbata
+438824413: Lavandula saharica
+439294172: Commiphora tetramera
+667355588: Melochia pyramidata
+474752652: Rauvolfia semperflorens
+437804966: Lupinus albus
+437667776: Taxus chinensis
+437016988: Lycopodiella alopecuroides
+437498688: Cyathea dombeyi
+437205236: Cerinthe major
+438972235: Epidendrum difforme
+1936123239: Cypripedium calceolus
+1935990933: Convolvulus secundus
+1320460457: Phyllanthus amarus
+1702851818: Palicourea elata
+1675788663: Carex utriculata
+1456419962: Psilocarphus oregonus
+3340047855: Schizachyrium scoparium
+1927799942: Echinopepon insularis
+1930203372: Celtis laevigata
+1929034867: Siparuna sessiliflora
+2012771368: Hymenophyllum plumosum
+1930177611: Calamus marginatus
+1928904177: Poa interior
+1928276301: Peperomia muscipara
+1929455203: Myrcia splendens
+1930554353: Solanum dulcamara
+1928034398: Castilleja organorum
+1929153673: Tabernaemontana laurifolia
+1930323339: Symphyotrichum foliaceum
+2513816674: Phaius tankervilleae
+2514046501: Drepananthus pruniferus
+2514666766: Corynephorus canescens
+2516864891: Helosciadium inundatum
+2513843663: Piper laurifolium
+2514563201: Salvia verbenaca
+2513556994: Elionurus muticus
+2517073037: Pseudolmedia laevis
+2513548374: Calamagrostis viridiflavescens
+2513105864: Digitaria hololeuca
+2514597432: Lonicera acuminata
+2513686671: Fimbristylis dichotoma
+2513064820: Vicia grandiflora
+2516564502: Myosotis laxa
+2513517294: Andropogon floridanus
+2515481887: Gahnia aspera
+2513028120: Campanula alpina
+2516037063: Cheirolophus sempervirens
+2514219426: Clausena lansium
+3020071550: Carex chordorrhiza
+3020078501: Carex appropinquata
+3019979401: Daphne mezereum
+1930839217: Enterolobium maximum
+1991428478: Stenocereus alamosensis
+1928506645: Chondria capillaris
+1930481189: Spyridia filamentosa
+1981501278: Myriopteris gracilis
+1928908236: Ascogrammitis anfractuosa
+3503204182: Erodium cicutarium
+1998729261: Amelanchier laevis
+1998968996: Platanthera flava
+2425445076: Calopogon tuberosus
+1998882868: Carex laxiculmis
+1999105588: Scheuchzeria palustris
+1998590947: Solidago uliginosa
+1998818359: Amelanchier utahensis
+1998308840: Juncus articulatus
+1998543749: Acer negundo
+1998975109: Myriophyllum humile
+1999410499: Stellaria crassifolia
+2425404585: Lysimachia maritima
+1999314245: Carex aurea
+2573155691: Schizachyrium brevifolium
+2573161023: Corydalis solida
+2572960903: Struthiopteris spicant
+2573054178: Quercus alba
+2513843779: Piper lessertianum
+2516625469: Neonauclea excelsa
+2516858404: Garuga pinnata
+2513512927: Elymus violaceus
+2514401720: Viola rupestris
+2514946920: Picris hieracioides
+2515816966: Ruppia maritima
+2515829688: Aglaia luzoniensis
+2514854487: Lamium maculatum
+2517498400: Margaritaria discoidea
+2005528821: Brachythecium populeum
+2005593775: Kurzia sylvatica
+2005539155: Ditrichum heteromallum
+2005533779: Cirriphyllum piliferum
+1146138679: Sphagnum girgensohnii
+1989533791: Urtica gracilis
+1989181820: Sphagnum palustre
+1989454858: Eriochloa acuminata
+2236018163: Flacourtia rukam
+1317226844: Dicranopteris linearis
+1318393575: Ditaxis argothamnoides
+1319648357: Physalis angulata
+1676047656: Passiflora holosericea
+1317746297: Croton californicus
+1319811635: Asplenium platyneuron
+2426886454: Amaranthus palmeri
+2452230713: Silene cappadocica
+1320398138: Monopyle mexiae
+1321991154: Melanthera parvifolia
+1426052966: Castilleja arachnoidea
+2234495456: Symphyotrichum concolor
+1257613902: Phaius flavus
+3342876306: Yucca harrimaniae
+1804372890: Equisetum variegatum
+2265509546: Poa pratensis
+2265485412: Blephilia hirsuta
+2265588715: Hieracium longipilum
+2265566552: Cyperus retrofractus
+1019752371: Siler montanum
+439633342: Lindernia dubia
+1927886612: Salix nigra
+1930277693: Phlegmariurus linifolius
+1929084979: Peperomia heterophylla
+1931220496: Acanthospermum hispidum
+1929900918: Miconia xalapensis
+1928678486: Eleocharis flavescens
+1930979681: Pernettya prostrata
+1426166249: Dalea polygonoides
+2981274251: Calceolaria flexuosa
+2981282666: Mentzelia parvifolia
+2236950433: Acer spicatum
+1998843424: Sorbus americana
+1998543588: Pilea pumila
+2425435971: Agalinis maritima
+1999112557: Dioscorea pilosiuscula
+1998836464: Hamamelis virginiana
+2446819762: Dalea lanata
+1999401722: Carex gynandra
+2575086103: Melampyrum lineare
+1998565761: Erysimum perenne
+1999418228: Rudbeckia hirta
+1999130856: Stellaria ruscifolia
+3341239321: Solidago rugosa
+3341248414: Stellaria graminea
+3341257544: Populus nigra
+1988903463: Zizania palustris
+1989179393: Potamogeton nodosus
+1989439019: Carex communis
+1988751949: Cynoglossum wallichii
+1989036412: Juncus brevicaudatus
+1989326782: Polygonum erectum
+3060557324: Onychium siliculosum
+1091087924: Rhizomnium glabrescens
+1091063773: Frullania asagrayana
+2418868230: Eragrostis pubescens
+2573469113: Viola macloskeyi
+2573544171: Berberis fendleri
+2573330291: Salix amygdaloides
+2573328013: Erigeron compositus
+912507071: Pradosia cochlearia
+912540960: Sisymbrium assoanum
+912565425: Rosa agrestis
+912570867: Rubus uncinatus
+1258000260: Castanea ozarkensis
+2235847388: Styrax americanus
+1675955089: Carex tumulicola
+1456376042: Artemisia ludoviciana
+1563240212: Inga feuillei
+2592238071: Lemna trisulca
+1322630109: Tilia americana
+2848440467: Lolium perenne
+1503173489: Diphasiastrum sitchense
+1931419278: Festuca vivipara
+2514177451: Hirtella americana
+2516011819: Zephyranthes rosea
+2514770172: Veronica scutellata
+2517266770: Lathyrus pratensis
+2512966787: Alchemilla monticola
+2513883147: Artocarpus rigidus
+2513913908: Ficus ribes
+2514708896: Lolium multiflorum
+1702827727: Chiococca alba
+2625873744: Maianthemum stellatum
+1563197526: Acaciella angustissima
+1318345053: Carya glabra
+1318724267: Viburnum prunifolium
+1321992876: Frankenia palmeri
+2397724211: Alnus acuminata
+1317726996: Clermontia persicifolia
+2425437928: Cuscuta gronovii
+1998684205: Tayloria splachnoides
+1999110359: Xyris difformis
+1998611243: Crataegus iracunda
+1999032409: Scirpus pedicellatus
+1998322454: Gaylussacia baccata
+1998540182: Agrostis stolonifera
+3703056529: Chenopodium album
+1998467065: Chimaphila menziesii
+1998686142: Calamagrostis canadensis
+2446828826: Dalea enneandra
+1998387136: Symphyotrichum pilosum
+1998808952: Crataegus pruinosa
+1999236210: Carex ekmanii
+2270190585: Asclepias cordifolia
+1324416944: Plagiomnium medium
+1324415808: Cirriphyllum piliferum
+2436799917: Neuropoa fax
+1144117269: Campanula patula
+1144117930: Jasione montana
+1144117467: Campanula rotundifolia
+1144116842: Succisa pratensis
+1144116297: Valeriana excelsa
+1144113124: Veronica serpyllifolia
+1144111255: Verbascum thapsus
+1144111060: Verbascum nigrum
+1144111291: Veronica agrestis
+1699821443: Lotononis stricta
+1699819061: Dimetia capitellata
+1988899873: Ulva intestinalis
+1989203918: Pneumatopteris inclusa
+1989452156: Carex glaucodea
+1988762889: Rudbeckia hirta
+1989065871: Agrostis hyemalis
+1989357976: Drepanocladus polygamus
+1988850469: Sphagnum recurvum
+2575039168: Ulmus crassifolia
+1999154508: Crataegus uniflora
+1999365236: Stellaria borealis
+1999217275: Crepis acuminata
+1998934839: Bryoxiphium norvegicum
+1998646322: Heuchera pilosissima
+1998370401: Crataegus schuettei
+3392122454: Persicaria maculosa
+1998927495: Polygonum ramosissimum
+3385632342: Prosthechea ochracea
+1998358368: Euploca ternata
+2012884607: Halerpestes cymbalaria
+1998980522: Crataegus pruinosa
+1978791988: Bistorta vivipara
+2516441644: Pedicularis cenisia
+2513632425: Rostraria hispida
+2513512357: Elymus spicatus
+2516367754: Calceolaria tripartita
+2516648377: Salvia japonica
+2517169218: Swartzia racemosa
+2514512541: Ardisia purpurea
+2516266693: Gentianella quinquefolia
+2516133079: Erica filiformis
+2514718776: Hypericum elodes
+437963061: Dalbergia hupeana
+438953416: Blumea lacera
+667477358: Silene apetala
+439392092: Machaerium stipitatum
+438478181: Berberis vulgaris
+731352746: Trigonella cariensis
+437268052: Indigofera aralensis
+694333780: Lawsonia inermis
+438057885: Uraria lagopodioides
+439102167: Stenocephalum megapotamicum
+607734568: Paederia foetida
+438658471: Astragalus keratensis
+3053337231: Viola sheltonii
+3053440922: Viola adunca
+3053442572: Trifolium depauperatum
+2830034412: Leptospermum polygalifolium
+1840286477: Bertiera thonneri
+1840055092: Tetracera alnifolia
+2985561385: Gladiolus tenuis
+575063194: Oxalis multicaulis
+575124657: Ranunculus sahendicus
+575349632: Salix babylonica
+684506968: Athyrium filix-femina
+474857008: Hydrolea spinosa
+607743206: Ixora kuakuensis
+439392892: Vitis vinifera
+437895938: Bidens rubifolia
+438835964: Chlorophytum alismifolium
+1843863678: Moricandia arvensis
+2517468164: Swartzia ulei
+2514598419: Psychotria vogeliana
+2515398418: Pallenis spinosa
+2516437739: Viola palustris
+2517885479: Tilia moltkei
+2516041985: Hebenstretia neglecta
+2516118664: Perrottetia alpestris
+2517843127: Parkia nitida
+2515897913: Echium creticum
+3413669029: Astragalus physocalyx
+1840413408: Psychotria goetzei
+575194266: Calendula arvensis
+575233549: Cyrtandra megalocrater
+1563210251: Trifolium bifidum
+1675861401: Carex strigosa
+2426917844: Dysphania ambrosioides
+2284150251: Polygonum douglasii
+436811558: Porotrichum elongatum
+436830234: Plagiothecium lucidum
+462516430: Halimeda macroloba
+667740521: Mielichhoferia clavitheca
+1936123523: Lathyrus japonicus
+1936025558: Equisetum ramosissimum
+1935972158: Gentiana pneumonanthe
+3469863708: Heterotheca fastigiata
+1991424996: Tillandsia leiboldiana
+1930993286: Pedicularis procera
+1930741763: Bromus commutatus
+1928664304: Spigelia anthelmia
+1931285394: Psychotria costivenia
+1928825645: Isocoma pluriflora
+1930001164: Juncus ensifolius
+1931101328: Carex atherodes
+1949826956: Boehmeria cylindrica
+1931004823: Pediomelum tenuiflorum
+1928495633: Astragalus miser
+1262195198: Poa juncifolia
+1424769351: Carex trichocarpa
+2612114399: Chloroleucon tenuiflorum
+1228454849: Solidago rigida
+1228254314: Miconia eriocalyx
+1228209830: Croton cordiifolius
+439453353: Euphorbia hirta
+439330078: Gnidia linearis
+437638258: Ludisia discolor
+437538294: Celtis mildbraedii
+437375295: Eleocharis dulcis
+1269651682: Ranunculus fascicularis
+3469894549: Baptisia alba
+3469965760: Solidago fistulosa
+3470019601: Castilleja miniata
+3469982849: Sagina decumbens
+1269720741: Seymeria deflexa
+3469909240: Stachys pycnantha
+1989305257: Callicladium haldanianum
+1988621268: Chara braunii
+1988896980: Racomitrium microcarpum
+1989188008: Hypnea musciformis
+1989449647: Coelastrum cambricum
+1988742851: Porella platyphylla
+1146376618: Moneses uniflora
+1852143901: Sanchezia parvibracteata
+2848499425: Agrostis perennans
+1456250583: Helenium puberulum
+2512789170: Menispermum canadense
+1318334082: Pilea inaequalis
+1563253027: Packera streptanthifolia
+2549483800: Ribes hudsonianum
+2595756978: Cyperus odoratus
+1990824315: Lygodesmia juncea
+1990825865: Rudbeckia triloba
+2595747531: Carex barrattii
+2236597761: Centaurium pulchellum
+1990809725: Gentiana saponaria
+1228491932: Juncus interior
+2235570434: Croton linearis
+1228315937: Carex pellita
+1424774269: Persicaria sagittata
+1228251414: Salix eriocephala
+1228306029: Juncus lesueurii
+1928095277: Liatris elegans
+1930650448: Panicum portoricense
+1928522992: Eleocharis equisetoides
+1929586920: Triplasis purpurea
+1930498220: Ericameria nauseosa
+1929215495: Myriopteris gracilis
+1928249367: Mirabilis multiflora
+1930585001: Symphyotrichum novi-belgii
+1931007684: Callithamnion tetragonum
+1929709919: Herbertus serratus
+3404307303: Dianthus strictus
+2597547672: Clinopodium nepeta
+3356432307: Dianthus orientalis
+1230523158: Crotalaria densicephala
+1230475863: Astragalus austriacus
+1230521660: Clusia trochiformis
+1319326174: Asclepias subverticillata
+2452332496: Anredera vesicaria
+1456402683: Leontodon hispidus
+1317840733: Elymus hystrix
+2234286160: Taraxacum erythrospermum
+1638361821: Barleria hochstetteri
+1563345611: Ambrosia artemisiifolia
+1319596167: Xanthium strumarium
+2235846679: Ocotea brachybotra
+1320104750: Sida jussiaeana
+1990003078: Philadelphus hirsutus
+2451547838: Baileya pauciradiata
+3709798405: Carex aestivalis
+3709961181: Clethra alnifolia
+2236156450: Comarostaphylis diversifolia
+1318897823: Adoxa moschatellina
+2236024256: Chaetogastra mollis
+2236116492: Calystegia sepium
+1839875557: Dialium pachyphyllum
+1839435764: Cenchrus unisetus
+1840238220: Bertiera naucleoides
+1260899329: Elaphoglossum petiolatum
+1839619660: Arabidopsis arenosa
+2243287836: Eragrostis cylindriflora
+1839372103: Gaylussacia pallida
+1839431691: Dischistocalyx thunbergiiflora
+1990250633: Phelipanche ramosa
+438991779: Brachiaria lata
+438985152: Diospyros gabunensis
+437640137: Solanum subinerme
+438313505: Carex crinita
+437160969: Myrtopsis pomaderridifolia
+437955154: Pachypodium lamerei
+2269030207: Arachniodes aristata
+2848503382: Vulpia myuros
+2452293722: Ranunculus hispidus
+1322253698: Embelia pacifica
+1318062184: Ludwigia peploides
+2516594183: Odontites luteus
+2513070303: Epilobium brachycarpum
+2514386394: Malvella lepidota
+2516664127: Legousia hybrida
+2515812343: Phyllanthus urinaria
+2514129946: Poterium verrucosum
+2516447912: Lipandra polysperma
+2515533625: Salix triandra
+2513966458: Aerva sanguinolenta
+2514695932: Carduus personata
+2515524620: Sicyos laciniatus
+2513652595: Triticum turgidum
+2516337873: Planchonella garberi
+2516850832: Cerastium uniflorum
+2513051709: Sulla spinosissima
+2516684995: Valeriana officinalis
+2515661641: Cyperus distans
+1424540219: Andryala nigricans
+1929883118: Symphyotrichum ontarionis
+1931242159: Oenothera calcicola
+1937515283: Pteridium aquilinum
+1930892685: Picramnia pentandra
+1928444218: Holcus lanatus
+1930777125: Agalinis divaricata
+1929531338: Carex praeceptorum
+1928246346: Brachyotum nutans
+1930031321: Prunella vulgaris
+1928816120: Coccocypselum capitatum
+2884115305: Weissia leiodonta
+1929339638: Erigeron acris
+1928097508: Carex leptalea
+1930341154: Cyclodictyon varians
+1927787608: Asplenium pumilum
+1928722299: Ulva compressa
+1675914558: Carex holostoma
+1702837908: Paederia foetida
+2397721138: Quercus stellata
+3005768358: Panicum stapfianum
+2643351883: Aphyllon uniflorum
+2452246731: Claytonia rubra
+2512992091: Dryopteris villarii
+2513976485: Axyris amaranthoides
+2516495194: Campanula persicifolia
+2517392950: Weinmannia fraxinea
+2513611146: Pilosella blyttiana
+2514509647: Myrsine lamii
+2516970560: Jacobaea leucophylla
+2517893930: Digitaria gymnostachys
+2515754360: Agathosma bisulca
+2514847693: Lepidium densiflorum
+2512991486: Dryopteris filix-mas
+437201612: Carya tonkinensis
+438825081: Goniopteris platypes
+438974427: Ranunculus pulchellus
+437492281: Aspilia sahariensis
+694707666: Wissadula parviflora
+2243247480: Hypericum tetrapterum
+1839457423: Nervilia bicarinata
+1839899251: Solanum melongena
+2651317899: Apocynum cannabinum
+2807245356: Selaginella pilifera
+474761544: Odontites vulgaris
+474815805: Jacquemontia blanchetii
+437420788: Senecio nutans
+437378615: Aegilops ventricosa
+438211454: Grisebachianthus plucheoides
+438902394: Haplopappus velutinus
+437612824: Cynoglossum nebrodense
+437656942: Euphorbia striata
+437868891: Liparis neuroglossa
+439310574: Lobelia urens
+2430261821: Eriogonum parvifolium
+1258695135: Astragalus simplicifolius
+1258765954: Brickellia monocephala
+3016516488: Pilosella caespitosa
+3016541682: Prunus tenella
+3016568853: Carex brunnescens
+3016596549: Lophoziopsis excisa
+439395736: Vitis aestivalis
+1839476976: Varronia macrocephala
+2243240526: Fimbristylis dichotoma
+1839421989: Forsteronia graciloides
+1839459560: Kohautia coccinea
+2430118223: Streptanthus coulteri
+2430412350: Phyllodoce empetriformis
+3005774322: Panicum commutatum
+2284285249: Stigmaphyllon bannisterioides
+2512764119: Cochlearia anglica
+3042814259: Equisetum arvense
+2514810679: Viola palustris
+2514056183: Friesodielsia latifolia
+2516750642: Bromus hordeaceus
+2513614211: Helianthus divaricatus
+2517250185: Persicaria chinensis
+2514618329: Scrophularia divaricata
+2513506176: Groenlandia densa
+2517302912: Hymenophyllum polyanthos
+2513520338: Aristida nemorivaga
+2562055936: Cypripedium macranthos
+1322099762: Chrysophyllum oliviforme
+3467359925: Crossidium squamiferum
+2284359495: Phyla cuneifolia
+2397716929: Megalastrum atrogriseum
+1260578109: Agave spicata
+3125034214: Erigeron bonariensis
+3356836015: Vicia americana
+1998387245: Trichostema brachiatum
+1999237355: Chenopodiastrum standleyanum
+1998956483: Arctostaphylos uva-ursi
+3356834058: Juniperus communis
+1998394637: Carex scabriuscula
+1999253468: Viola odorata
+3734813368: Juncus dudleyi
+3111515383: Alloberberis trifoliolata
+2234232983: Macropiper puberulum
+1999262308: Paronychia sessiliflora
+1998974785: Castilleja applegatei
+1998693120: Antennaria plantaginifolia
+1701973524: Stellaria longifolia
+1702263060: Phyllodoce caerulea
+1702235740: Rubus grabowskii
+1702357013: Euphorbia cyparissias
+1702425560: Micranthes nivalis
+1702254438: Lipandra polysperma
+1701477809: Arabidopsis thaliana
+2867548082: Taraxacum stictophyllum
+1701706120: Carex pulicaris
+1702237871: Myrrhis odorata
+1702359234: Gentianella amarella
+1701896120: Lotus corniculatus
+1702311724: Digitalis purpurea
+3115500317: Diplazium sylvaticum
+2900455868: Cinna arundinacea
+1675796841: Caesalpinia pulcherrima
+1563299852: Inga nobilis
+2512801142: Rorippa palustris
+3005755518: Paspalum decumbens
+1455960532: Tapirira obtusa
+2612111434: Euphorbia berteroana
+1228212556: Myosotis laxa
+1424774292: Persicaria sagittata
+2612115591: Baccharis auriculigera
+1228276754: Rorippa palustris
+1228476543: Anemopaegma arvense
+1228576989: Juncus nodosus
+1823454475: Gurania lobata
+1701729979: Cypripedium fasciculatum
+1928881910: Adiantum pedatum
+1931047540: Typha angustifolia
+1929881359: Acer saccharum
+1928479020: Cirsium mexicanum
+1930924863: Chaetomorpha antennina
+1929472083: Juniperus scopulorum
+2447314594: Angadenia berteroi
+1930491842: Galium trifidum
+1929372145: Opuntia repens
+1928581487: Eriogonum heermannii
+1949839092: Utricularia gibba
+1930729559: Oclemena acuminata
+2597502209: Pteris quadriaurita
+1990707954: Deparia prolifera
+1500125084: Didymoglossum godmanii
+1500170486: Asplenium viride
+2513603274: Paspalum conjugatum
+2514462780: Vitex agnus-castus
+2514481577: Bignonia aequinoctialis
+2517671288: Gratiola officinalis
+2516176063: Hypericum elodes
+2513046013: Mallotus subulatus
+2517838221: Clinopodium alpinum
+2516233977: Arytera litoralis
+2514398234: Abutilon theophrasti
+2517101413: Actinotaenium mooreanum
+2515134729: Micrasterias furcata
+2514464378: Gynotroches axillaris
+2515744959: Cistus salviifolius
+2514086404: Litsea elliptica
+2514242786: Cleidion javanicum
+2517240779: Lotus alpinus
+2515006045: Diplazium polypodioides
+2516807990: Populus alba
+2516660906: Cymbalaria muralis
diff --git a/nbs/data/resized-images/1212567865.jpg b/nbs/data/resized-images/1212567865.jpg
new file mode 100644
index 0000000..fa63280
Binary files /dev/null and b/nbs/data/resized-images/1212567865.jpg differ
diff --git a/nbs/data/resized-images/1317278320.jpg b/nbs/data/resized-images/1317278320.jpg
new file mode 100644
index 0000000..205e111
Binary files /dev/null and b/nbs/data/resized-images/1317278320.jpg differ
diff --git a/nbs/data/resized-images/1317726996.jpg b/nbs/data/resized-images/1317726996.jpg
new file mode 100644
index 0000000..17d7f79
Binary files /dev/null and b/nbs/data/resized-images/1317726996.jpg differ
diff --git a/nbs/data/resized-images/1317746297.jpg b/nbs/data/resized-images/1317746297.jpg
new file mode 100644
index 0000000..c62bd14
Binary files /dev/null and b/nbs/data/resized-images/1317746297.jpg differ
diff --git a/nbs/data/resized-images/1317840733.jpg b/nbs/data/resized-images/1317840733.jpg
new file mode 100644
index 0000000..9f625bb
Binary files /dev/null and b/nbs/data/resized-images/1317840733.jpg differ
diff --git a/nbs/data/resized-images/1318027385.jpg b/nbs/data/resized-images/1318027385.jpg
new file mode 100644
index 0000000..a462ee2
Binary files /dev/null and b/nbs/data/resized-images/1318027385.jpg differ
diff --git a/nbs/data/resized-images/1318182025.jpg b/nbs/data/resized-images/1318182025.jpg
new file mode 100644
index 0000000..5bdbcbe
Binary files /dev/null and b/nbs/data/resized-images/1318182025.jpg differ
diff --git a/nbs/data/resized-images/1318212360.jpg b/nbs/data/resized-images/1318212360.jpg
new file mode 100644
index 0000000..d2dcf0d
Binary files /dev/null and b/nbs/data/resized-images/1318212360.jpg differ
diff --git a/nbs/data/resized-images/1318293083.jpg b/nbs/data/resized-images/1318293083.jpg
new file mode 100644
index 0000000..ab037f1
Binary files /dev/null and b/nbs/data/resized-images/1318293083.jpg differ
diff --git a/nbs/data/resized-images/437160969.jpg b/nbs/data/resized-images/437160969.jpg
new file mode 100644
index 0000000..f5e50fd
Binary files /dev/null and b/nbs/data/resized-images/437160969.jpg differ
diff --git a/nbs/index.ipynb b/nbs/index.ipynb
index 0a8aa97..47de2bd 100644
--- a/nbs/index.ipynb
+++ b/nbs/index.ipynb
@@ -19,6 +19,13 @@
"> Character recognition from herbaria image samples."
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### *Smriti Suresh, Dima Kazlouski, Douglas Moy - 2023-10-06 v1.0.0-dev*"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -33,6 +40,17 @@
"See [project documentation](https://bu-spark.github.io/HerbariaOCR/) for the rendered documentation output."
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Overview\r\n",
+ "\r\n",
+ "The changing climate increases stressors that weaken plant resilience, disrupting forest structure and ecosystem services. Rising temperatures cause more frequent droughts, wildfires, and invasive pest outbreaks, leading to the loss of plant species. This has numerous detrimental effects, including lowered productivity, the spread of invasive plants, vulnerability to pests, and altered ecosystem structure. The project aims to aid climate scientists in capturing patterns in plant life in relation to the changing climate.\r\n",
+ "The herbarium specimens are pressed plant samples stored on paper. The specimen labels are handwritten and date back to the early 1900s. The labels contain the curator's name, their institution, the species and genus, and the date the specimen was collected. Since the labels are handwritten, they are not readily accessible from an analytical standpoint. The data, at this time, cannot be analyzed to study the impact of climate on plant life.\r\n",
+ "The digitized samples are an invaluable source of information for climate change scientists, and are providing key insights into biodiversity change over the last century. Digitized specimens will facilitate easier dissemination of information and allow more people access to data. The project, if successful, would enable users from various domains in environmental science to further studies pertaining to climate change and its effects on flora and even fauna."
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -60,27 +78,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Fill me in please! Don't forget code examples:"
+ "Refer to the EDA tab for exploring the Herbaria OCR data sources."
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Refer to the Azure Vision tab for exploring the implementation of the OCR pipeline to obtain results from the models."
+ ]
+ },
+ {
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "2"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "1+1"
+ "Refer to the LLM Evaluation tabs for exploring the evaluation metrics used, as well as the accuracy results for the English, Cyrillic and Chinese samples."
]
},
{
@@ -99,13 +111,6 @@
"1. Contribute bug reports and feature requests by submitting [issues](https://github.com/BU-Spark/HerbariaOCR/issues) to the GitHub repo.\n",
"2. If you want to create Pull Requests with code changes, read the [contributing guide](https://github.com/BU-Spark/HerbariaOCR/blob/main/CONTRIBUTING.md) on the github repo."
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/nbs/sidebar.yml b/nbs/sidebar.yml
index 45222d2..4f1a02c 100644
--- a/nbs/sidebar.yml
+++ b/nbs/sidebar.yml
@@ -2,4 +2,7 @@ website:
sidebar:
contents:
- index.ipynb
- - 00_core.ipynb
+ - 00_AzureVision.ipynb
+ - 01_LLM_Evaluation.ipynb
+ - 02_LLM_Evaluation_Chinese.ipynb
+ - 03_LLM_Evaluation_Cyrilic.ipynb
diff --git a/nbs/styles.css b/nbs/styles.css
index 66ccc49..5c0856b 100644
--- a/nbs/styles.css
+++ b/nbs/styles.css
@@ -1,37 +1,37 @@
-.cell {
- margin-bottom: 1rem;
-}
-
-.cell > .sourceCode {
- margin-bottom: 0;
-}
-
-.cell-output > pre {
- margin-bottom: 0;
-}
-
-.cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre {
- margin-left: 0.8rem;
- margin-top: 0;
- background: none;
- border-left: 2px solid lightsalmon;
- border-top-left-radius: 0;
- border-top-right-radius: 0;
-}
-
-.cell-output > .sourceCode {
- border: none;
-}
-
-.cell-output > .sourceCode {
- background: none;
- margin-top: 0;
-}
-
-div.description {
- padding-left: 2px;
- padding-top: 5px;
- font-style: italic;
- font-size: 135%;
- opacity: 70%;
-}
+.cell {
+ margin-bottom: 1rem;
+}
+
+.cell > .sourceCode {
+ margin-bottom: 0;
+}
+
+.cell-output > pre {
+ margin-bottom: 0;
+}
+
+.cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre {
+ margin-left: 0.8rem;
+ margin-top: 0;
+ background: none;
+ border-left: 2px solid lightsalmon;
+ border-top-left-radius: 0;
+ border-top-right-radius: 0;
+}
+
+.cell-output > .sourceCode {
+ border: none;
+}
+
+.cell-output > .sourceCode {
+ background: none;
+ margin-top: 0;
+}
+
+div.description {
+ padding-left: 2px;
+ padding-top: 5px;
+ font-style: italic;
+ font-size: 135%;
+ opacity: 70%;
+}
diff --git a/settings.ini b/settings.ini
index 2ddee6b..429ee27 100644
--- a/settings.ini
+++ b/settings.ini
@@ -38,6 +38,6 @@ status = 3
user = BU-Spark
### Optional ###
-# requirements = fastcore pandas
+requirements = azure-ai-formrecognizer opencv-python-headless matplotlib pillow ipywidgets shapely openai reportlab taxonerd tqdm seaborn pandas
# dev_requirements =
# console_scripts =
\ No newline at end of file