Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature(testing_analysis): testing for data analysis of the scraped… #119

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions src/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import json
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
import csv

#code
def perform_analyses(data_file):
    """Print summary statistics and show histograms for scraped website data.

    The file is first parsed as JSON; if that fails it is parsed as CSV with
    a header row. The records are loaded into a DataFrame, the row count and
    the means of ``char_count`` and ``image_count`` are printed, and one
    histogram per column is displayed.

    Args:
        data_file: Path of the JSON or CSV file to analyse.

    Returns:
        None. Results are printed and plotted. A missing file or a
        ``ValueError`` (unsupported format, JSON that is not a list, missing
        required columns) is reported by printing a message instead of
        raising.
    """
    required_columns = ('char_count', 'image_count')

    try:
        # Read the file the caller asked for (previously a hard-coded
        # 'data.csv' silently replaced the argument).
        with open(data_file, 'r') as f:
            file_content = f.read()

        try:
            data = json.loads(file_content)
        except json.JSONDecodeError:
            # Not JSON: fall back to CSV, treating the first row as headers.
            try:
                data = list(csv.DictReader(StringIO(file_content)))
            except csv.Error as err:
                raise ValueError("The file format is not supported.") from err
        else:
            if not isinstance(data, list):
                raise ValueError("The data should be formatted as a list of objects.")

        df = pd.DataFrame(data)

        # Fail with a readable message rather than an uncaught KeyError.
        missing = [name for name in required_columns if name not in df.columns]
        if missing:
            raise ValueError("Missing required column(s): " + ", ".join(missing))

        # csv.DictReader yields every value as a string; convert so the
        # statistics are numeric. Unparseable cells become NaN, which
        # Series.mean() skips by default.
        for name in required_columns:
            df[name] = pd.to_numeric(df[name], errors='coerce')

        # Display basic statistics
        print("Total Websites Analyzed:", df.shape[0])
        print("Average character count:", df['char_count'].mean())
        print("Average image count:", df['image_count'].mean())

        # One histogram per analysed column.
        for name, label in (('char_count', 'Character count'),
                            ('image_count', 'Image count')):
            plt.figure(figsize=(10, 5))
            df[name].hist(bins=50)
            plt.title(label + ' histogram')
            plt.xlabel(label)
            plt.ylabel('Frequency')
            plt.show()

    except ValueError as e:
        print(e)

    except FileNotFoundError:
        print("The file was not found")
44 changes: 0 additions & 44 deletions src/analyze_data.py

This file was deleted.

103 changes: 103 additions & 0 deletions src/test_analysis.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from io import StringIO\n",
"import csv\n",
"import unittest"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def perform_analyses(data_file):\n",
" try:\n",
" # Open file in read mode\n",
" with open(data_file, 'r') as f:\n",
" file_content = f.read()\n",
"\n",
" # Try to parse as JSON\n",
" try:\n",
" data = json.loads(file_content)\n",
" is_json = True\n",
" except json.JSONDecodeError:\n",
" is_json = False\n",
"\n",
" # If JSON parsing failed, try parsing as CSV\n",
" if not is_json:\n",
" try:\n",
" # If the file has headers\n",
" data = list(csv.DictReader(StringIO(file_content)))\n",
" is_csv = True\n",
" except csv.Error:\n",
" is_csv = False\n",
"\n",
" if not is_json and not is_csv:\n",
" raise ValueError(\"The file format is not supported.\")\n",
"\n",
" if is_json and not isinstance(data, list):\n",
" raise ValueError(\"The data should be formatted as a list of objects.\")\n",
"\n",
" df = pd.DataFrame(data)\n",
"\n",
" # Display basic statistics\n",
" print(\"Total Websites Analyzed:\", df.shape[0])\n",
" print(\"Average character count:\", str(df['char_count'].mean()))\n",
" print(\"Average image count:\", str(df['image_count'].mean()))\n",
"\n",
" # plot character count histogram\n",
" plt.figure(figsize=(10, 5))\n",
" df['char_count'].hist(bins=50)\n",
" plt.title('Character count histogram')\n",
" plt.xlabel('Character count')\n",
" plt.ylabel('Frequency')\n",
" plt.show()\n",
"\n",
" # plot image count histogram\n",
" plt.figure(figsize=(10, 5))\n",
" df['image_count'].hist(bins=50)\n",
" plt.title('Image count histogram')\n",
" plt.xlabel('Image count')\n",
" plt.ylabel('Frequency')\n",
" plt.show()\n",
"\n",
" except ValueError as e:\n",
" print(e)\n",
"\n",
" except FileNotFoundError:\n",
" print(\"The file was not found\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
67 changes: 67 additions & 0 deletions src/test_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import json
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
import csv
import unittest

# code
def perform_analyses(data_file):
    """Print summary statistics and show histograms for scraped website data.

    The file is first parsed as JSON; if that fails it is parsed as CSV with
    a header row. The records are loaded into a DataFrame, the row count and
    the means of ``char_count`` and ``image_count`` are printed, and one
    histogram per column is displayed.

    Args:
        data_file: Path of the JSON or CSV file to analyse.

    Returns:
        None. Results are printed and plotted. A missing file or a
        ``ValueError`` (unsupported format, JSON that is not a list, missing
        required columns) is reported by printing a message instead of
        raising.
    """
    required_columns = ('char_count', 'image_count')

    try:
        with open(data_file, 'r') as f:
            file_content = f.read()

        try:
            data = json.loads(file_content)
        except json.JSONDecodeError:
            # Not JSON: fall back to CSV, treating the first row as headers.
            try:
                data = list(csv.DictReader(StringIO(file_content)))
            except csv.Error as err:
                raise ValueError("The file format is not supported.") from err
        else:
            if not isinstance(data, list):
                raise ValueError("The data should be formatted as a list of objects.")

        df = pd.DataFrame(data)

        # Fail with a readable message rather than an uncaught KeyError.
        missing = [name for name in required_columns if name not in df.columns]
        if missing:
            raise ValueError("Missing required column(s): " + ", ".join(missing))

        # csv.DictReader yields every value as a string; convert so the
        # statistics are numeric. Unparseable cells become NaN, which
        # Series.mean() skips by default.
        for name in required_columns:
            df[name] = pd.to_numeric(df[name], errors='coerce')

        # Display basic statistics
        print("Total Websites Analyzed:", df.shape[0])
        print("Average character count:", str(df['char_count'].mean()))
        print("Average image count:", str(df['image_count'].mean()))

        # One histogram per analysed column.
        for name, label in (('char_count', 'Character count'),
                            ('image_count', 'Image count')):
            plt.figure(figsize=(10, 5))
            df[name].hist(bins=50)
            plt.title(label + ' histogram')
            plt.xlabel(label)
            plt.ylabel('Frequency')
            plt.show()

    except ValueError as e:
        print(e)

    except FileNotFoundError:
        print("The file was not found")

# Script entry point: when run directly, hand control to unittest's
# command-line runner.
# NOTE(review): no TestCase classes are visible in this file, so this
# presumably runs zero tests -- confirm tests are defined or imported.
if __name__ == "__main__":
    unittest.main()
67 changes: 67 additions & 0 deletions test/test _data_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import json
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
import csv
import unittest

# code
def perform_analyses(data_file):
    """Print summary statistics and show histograms for scraped website data.

    The file is first parsed as JSON; if that fails it is parsed as CSV with
    a header row. The records are loaded into a DataFrame, the row count and
    the means of ``char_count`` and ``image_count`` are printed, and one
    histogram per column is displayed.

    Args:
        data_file: Path of the JSON or CSV file to analyse.

    Returns:
        None. Results are printed and plotted. A missing file or a
        ``ValueError`` (unsupported format, JSON that is not a list, missing
        required columns) is reported by printing a message instead of
        raising.
    """
    required_columns = ('char_count', 'image_count')

    try:
        with open(data_file, 'r') as f:
            file_content = f.read()

        try:
            data = json.loads(file_content)
        except json.JSONDecodeError:
            # Not JSON: fall back to CSV, treating the first row as headers.
            try:
                data = list(csv.DictReader(StringIO(file_content)))
            except csv.Error as err:
                raise ValueError("The file format is not supported.") from err
        else:
            if not isinstance(data, list):
                raise ValueError("The data should be formatted as a list of objects.")

        df = pd.DataFrame(data)

        # Fail with a readable message rather than an uncaught KeyError.
        missing = [name for name in required_columns if name not in df.columns]
        if missing:
            raise ValueError("Missing required column(s): " + ", ".join(missing))

        # csv.DictReader yields every value as a string; convert so the
        # statistics are numeric. Unparseable cells become NaN, which
        # Series.mean() skips by default.
        for name in required_columns:
            df[name] = pd.to_numeric(df[name], errors='coerce')

        # Display basic statistics
        print("Total Websites Analyzed:", df.shape[0])
        print("Average character count:", str(df['char_count'].mean()))
        print("Average image count:", str(df['image_count'].mean()))

        # One histogram per analysed column.
        for name, label in (('char_count', 'Character count'),
                            ('image_count', 'Image count')):
            plt.figure(figsize=(10, 5))
            df[name].hist(bins=50)
            plt.title(label + ' histogram')
            plt.xlabel(label)
            plt.ylabel('Frequency')
            plt.show()

    except ValueError as e:
        print(e)

    except FileNotFoundError:
        print("The file was not found")

# Script entry point: when run directly, hand control to unittest's
# command-line runner.
# NOTE(review): no TestCase classes are visible in this file, so this
# presumably runs zero tests -- confirm tests are defined or imported.
if __name__ == "__main__":
    unittest.main()