From 96cd8834891c2ceb266b8303a92351fdd21b1cbe Mon Sep 17 00:00:00 2001 From: Motouom_Wandjeu_Victoire_Josue Date: Sat, 2 Dec 2023 10:01:01 +0100 Subject: [PATCH 1/2] feature(testing_ananlysis): testing for data analysis of the scrapped data. --- src/analysis.py | 66 +++++++++++++++++++++++++ src/test_analysis.ipynb | 103 ++++++++++++++++++++++++++++++++++++++++ src/test_analysis.py | 67 ++++++++++++++++++++++++++ 3 files changed, 236 insertions(+) create mode 100644 src/analysis.py create mode 100644 src/test_analysis.ipynb create mode 100644 src/test_analysis.py diff --git a/src/analysis.py b/src/analysis.py new file mode 100644 index 0000000..bac087f --- /dev/null +++ b/src/analysis.py @@ -0,0 +1,66 @@ +import json +import pandas as pd +import matplotlib.pyplot as plt +from io import StringIO +import csv + +#code +def perform_analyses(data_file): + try: + # Open file in read mode + data = 'data.csv' + + with open(data, 'r') as f: + file_content = f.read() + + + # Try to parse as JSON + try: + data = json.loads(file_content) + is_json = True + except json.JSONDecodeError: + is_json = False + + # If JSON parsing failed, try parsing as CSV + if not is_json: + try: + # If the file has headers + data = list(csv.DictReader(StringIO(file_content))) + is_csv = True + except csv.Error: + is_csv = False + + if not is_json and not is_csv: + raise ValueError("The file format is not supported.") + + if is_json and not isinstance(data, list): + raise ValueError("The data should be formatted as a list of objects.") + + df = pd.DataFrame(data) + + # Display basic statistics + print("Total Websites Analyzed:", df.shape[0]) + print("Average character count:", df['char_count'].mean()) + print("Average image count:", df['image_count'].mean()) + + # plot character count histogram + plt.figure(figsize=(10, 5)) + df['char_count'].hist(bins=50) + plt.title('Character count histogram') + plt.xlabel('Character count') + plt.ylabel('Frequency') + plt.show() + + # plot image count histogram + plt.figure(figsize=(10, 5)) + df['image_count'].hist(bins=50) + plt.title('Image count histogram') + plt.xlabel('Image count') + plt.ylabel('Frequency') + plt.show() + + except ValueError as e: + print(e) + + except FileNotFoundError: + print("The file was not found") \ No newline at end of file diff --git a/src/test_analysis.ipynb b/src/test_analysis.ipynb new file mode 100644 index 0000000..eb281e7 --- /dev/null +++ b/src/test_analysis.ipynb @@ -0,0 +1,103 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from io import StringIO\n", + "import csv\n", + "import unittest" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def perform_analyses(data_file):\n", + " try:\n", + " # Open file in read mode\n", + " with open(data_file, 'r') as f:\n", + " file_content = f.read()\n", + "\n", + " # Try to parse as JSON\n", + " try:\n", + " data = json.loads(file_content)\n", + " is_json = True\n", + " except json.JSONDecodeError:\n", + " is_json = False\n", + "\n", + " # If JSON parsing failed, try parsing as CSV\n", + " if not is_json:\n", + " try:\n", + " # If the file has headers\n", + " data = list(csv.DictReader(StringIO(file_content)))\n", + " is_csv = True\n", + " except csv.Error:\n", + " is_csv = False\n", + "\n", + " if not is_json and not is_csv:\n", + " raise ValueError(\"The file format is not supported.\")\n", + "\n", + " if is_json and not isinstance(data, list):\n", + " raise ValueError(\"The data should be formatted as a list of objects.\")\n", + "\n", + " df = pd.DataFrame(data)\n", + "\n", + " # Display basic statistics\n", + " print(\"Total Websites Analyzed:\", df.shape[0])\n", + " print(\"Average character count:\", str(df['char_count'].mean()))\n", + " print(\"Average image count:\", str(df['image_count'].mean()))\n", + "\n", + " # plot character count histogram\n", + " plt.figure(figsize=(10, 5))\n", + " df['char_count'].hist(bins=50)\n", + " plt.title('Character count histogram')\n", + " plt.xlabel('Character count')\n", + " plt.ylabel('Frequency')\n", + " plt.show()\n", + "\n", + " # plot image count histogram\n", + " plt.figure(figsize=(10, 5))\n", + " df['image_count'].hist(bins=50)\n", + " plt.title('Image count histogram')\n", + " plt.xlabel('Image count')\n", + " plt.ylabel('Frequency')\n", + " plt.show()\n", + "\n", + " except ValueError as e:\n", + " print(e)\n", + "\n", + " except FileNotFoundError:\n", + " print(\"The file was not found\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/test_analysis.py b/src/test_analysis.py new file mode 100644 index 0000000..40d52f7 --- /dev/null +++ b/src/test_analysis.py @@ -0,0 +1,67 @@ +import json +import pandas as pd +import matplotlib.pyplot as plt +from io import StringIO +import csv +import unittest + +# code +def perform_analyses(data_file): + try: + # Open file in read mode + with open(data_file, 'r') as f: + file_content = f.read() + + # Try to parse as JSON + try: + data = json.loads(file_content) + is_json = True + except json.JSONDecodeError: + is_json = False + + # If JSON parsing failed, try parsing as CSV + if not is_json: + try: + # If the file has headers + data = list(csv.DictReader(StringIO(file_content))) + is_csv = True + except csv.Error: + is_csv = False + + if not is_json and not is_csv: + raise ValueError("The file format is not supported.") + + if is_json and not isinstance(data, list): + raise ValueError("The data should be formatted as a list of objects.") + + df = pd.DataFrame(data) + + # Display basic statistics + print("Total Websites Analyzed:", df.shape[0]) + print("Average character count:", str(df['char_count'].mean())) + print("Average image count:", str(df['image_count'].mean())) + + # plot character count histogram + plt.figure(figsize=(10, 5)) + df['char_count'].hist(bins=50) + plt.title('Character count histogram') + plt.xlabel('Character count') + plt.ylabel('Frequency') + plt.show() + + # plot image count histogram + plt.figure(figsize=(10, 5)) + df['image_count'].hist(bins=50) + plt.title('Image count histogram') + plt.xlabel('Image count') + plt.ylabel('Frequency') + plt.show() + + except ValueError as e: + print(e) + + except FileNotFoundError: + print("The file was not found") + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 9df0c753f01c869ba0ffa434fa96f56b1552236c Mon Sep 17 00:00:00 2001 From: Motouom_Wandjeu_Victoire_Josue Date: Mon, 4 Dec 2023 10:11:22 +0100 Subject: [PATCH 2/2] feature(testing_ananlysis): testing for analysed data that was scrapped from the desired website --- src/analyze_data.py | 44 ------------------------ test/test _data_analysis.py | 67 +++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 44 deletions(-) delete mode 100644 src/analyze_data.py create mode 100644 test/test _data_analysis.py diff --git a/src/analyze_data.py b/src/analyze_data.py deleted file mode 100644 index 514993d..0000000 --- a/src/analyze_data.py +++ /dev/null @@ -1,44 +0,0 @@ -from sklearn.cluster import KMeans - -def analyze_data(data): - if data is not None and 'Data' in data.columns: - # Calculate average word count and image count - data['WordCount'] = data['Data'].apply(lambda x: len(re.findall(r'\b\w+\b', str(x)))) - data['ImageCount'] = data['Data'].apply(count_images) - - # Check if 'WordCount' and 'ImageCount' columns exist in the DataFrame - if 'WordCount' in data.columns and 'ImageCount' in data.columns: - average_word_count = data['WordCount'].mean() - average_image_count = data['ImageCount'].mean() - - return {'average_word_count': average_word_count, 'average_image_count': average_image_count} - else: - return None - else: - return None - - -def count_images(text): - img_tags = re.findall(r']+>', str(text)) - return len(img_tags) - - -def perform_cluster_analysis(data): - # Analyze the data - # TODO Why are we calling analyze_data() here? - analysis_result = analyze_data(data) - - # Extract relevant features for clustering - features = data[['WordCount', 'ImageCount']] - - # Check if there are any samples to cluster - if not features.empty: - # Use K-Means clustering - kmeans = KMeans(n_clusters=3, random_state=42) - data['Cluster'] = kmeans.fit_predict(features) - - return data[['Data', 'Cluster']] - else: - # Handle the case when there are no samples to cluster - return None - diff --git a/test/test _data_analysis.py b/test/test _data_analysis.py new file mode 100644 index 0000000..40d52f7 --- /dev/null +++ b/test/test _data_analysis.py @@ -0,0 +1,67 @@ +import json +import pandas as pd +import matplotlib.pyplot as plt +from io import StringIO +import csv +import unittest + +# code +def perform_analyses(data_file): + try: + # Open file in read mode + with open(data_file, 'r') as f: + file_content = f.read() + + # Try to parse as JSON + try: + data = json.loads(file_content) + is_json = True + except json.JSONDecodeError: + is_json = False + + # If JSON parsing failed, try parsing as CSV + if not is_json: + try: + # If the file has headers + data = list(csv.DictReader(StringIO(file_content))) + is_csv = True + except csv.Error: + is_csv = False + + if not is_json and not is_csv: + raise ValueError("The file format is not supported.") + + if is_json and not isinstance(data, list): + raise ValueError("The data should be formatted as a list of objects.") + + df = pd.DataFrame(data) + + # Display basic statistics + print("Total Websites Analyzed:", df.shape[0]) + print("Average character count:", str(df['char_count'].mean())) + print("Average image count:", str(df['image_count'].mean())) + + # plot character count histogram + plt.figure(figsize=(10, 5)) + df['char_count'].hist(bins=50) + plt.title('Character count histogram') + plt.xlabel('Character count') + plt.ylabel('Frequency') + plt.show() + + # plot image count histogram + plt.figure(figsize=(10, 5)) + df['image_count'].hist(bins=50) + plt.title('Image count histogram') + plt.xlabel('Image count') + plt.ylabel('Frequency') + plt.show() + + except ValueError as e: + print(e) + + except FileNotFoundError: + print("The file was not found") + +if __name__ == '__main__': + unittest.main() \ No newline at end of file