-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdd2.py
133 lines (111 loc) · 5.13 KB
/
dd2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import streamlit as st
import pandas as pd
from charset_normalizer import from_bytes
import io
# Detect Encoding Function
def detect_encoding(uploaded_file):
    """
    Detect the text encoding of an uploaded file.

    Args:
        uploaded_file: A seekable binary file-like object (e.g. the value
            returned by ``st.file_uploader``).

    Returns:
        The encoding name reported by ``charset_normalizer`` for the best
        match, or ``"utf-8"`` when no match is found.

    The stream is rewound to the start both before and after reading, so the
    result does not depend on the stream's current position and the caller
    can read the file again from the beginning afterwards.
    """
    # Rewind first: if the stream was already (partly) consumed, read() would
    # otherwise return truncated or empty bytes and skew the detection.
    uploaded_file.seek(0)
    raw_data = uploaded_file.read()
    uploaded_file.seek(0)  # Leave the stream ready for the next reader

    best_match = from_bytes(raw_data).best()
    if best_match is None:
        return "utf-8"
    return best_match.encoding
# Function to style and wrap DataFrame
def style_dataframe_for_wrapping(df):
    """
    Render a DataFrame as an HTML table whose cells wrap long text.

    Args:
        df: The pandas DataFrame to render. The index is not included.

    Returns:
        An HTML string: the table produced by ``DataFrame.to_html`` with an
        inline ``style`` attribute added to the opening ``<table ...>`` tag
        and a ``<style>`` block (cell wrapping rules) placed right after it.

    Cell values and headers are HTML-escaped, because the data may come from
    an uploaded file and the result is meant to be rendered as raw HTML.
    """
    table_style = (
        "word-wrap: break-word; table-layout: fixed; "
        "width: 100%; border-collapse: collapse;"
    )
    # Kept free of blank lines so that a Markdown renderer treats the whole
    # output as a single HTML block.
    cell_css = "\n".join([
        "<style>",
        "td, th {",
        "word-wrap: break-word;",
        "max-width: 200px; /* Adjust as needed */",
        "white-space: normal;",
        "overflow-wrap: break-word;",
        "padding: 8px;",
        "}",
        "</style>",
    ])

    html = df.to_html(escape=True, index=False)

    # to_html() emits an opening tag that already carries attributes
    # (e.g. border/class), so matching the literal "<table>" never succeeds.
    # Instead, locate the end of the opening tag and inject the styling there.
    tag_end = html.find(">")
    if not html.startswith("<table") or tag_end == -1:
        return html  # Unexpected markup: return it unstyled rather than corrupt it
    return (
        f'{html[:tag_end]} style="{table_style}">\n'
        f"{cell_css}{html[tag_end + 1:]}"
    )
# Main Streamlit Application
def run():
    """
    Streamlit entry point: upload a file, preview it, and deduplicate it.

    Flow:
      1. Ask for a ``.py`` or ``.csv`` upload and report its detected encoding.
      2. For CSV files, load the data and show a preview (top rows, or the
         whole frame when the checkbox is ticked).
      3. Let the user pick the columns that define a duplicate, drop the
         duplicate rows, and offer the result as a CSV download together
         with an HTML preview of the first rows.

    Returns:
        None. All output goes to the Streamlit page.
    """
    st.title("Deduplication FTW")

    # File Upload Section
    uploaded_file = st.file_uploader(
        "Upload your Python or CSV file:", type=["py", "csv"]
    )
    if uploaded_file is None:
        st.info("Please upload a CSV file to get started.")
        return

    df = None  # Stays None for non-CSV uploads and for files that fail to load
    try:
        encoding = detect_encoding(uploaded_file)
        st.success(f"Detected encoding: {encoding}")

        # Compare case-insensitively so that e.g. "DATA.CSV" is still loaded.
        if uploaded_file.name.lower().endswith(".csv"):
            df = pd.read_csv(uploaded_file, encoding=encoding)
            st.write("Preview of the uploaded CSV:")

            # Toggle between the entire DataFrame and its top rows
            show_all = st.checkbox("Show entire DataFrame", value=False)
            if show_all:
                st.write("Displaying entire DataFrame:")
                st.dataframe(df)
            else:
                st.write("Displaying top rows of DataFrame:")
                st.dataframe(df.head())
        else:
            st.write("Uploaded file is not a CSV. Only encoding was detected.")
    except Exception as e:
        # UI boundary: report the problem on the page instead of crashing.
        st.error(f"An error occurred while processing the file: {e}")

    # Deduplication Section
    if df is None:
        st.info("Please upload a CSV file to enable deduplication options.")
        return

    st.markdown("## Deduplication Options")
    dedupe_fields = st.multiselect(
        "Choose fields to deduplicate:", options=df.columns.tolist()
    )
    if not dedupe_fields:
        st.info("Select fields for deduplication to see the result.")
        return

    deduplicated_df = df.drop_duplicates(subset=dedupe_fields)
    st.markdown("### Data after Deduplication")

    # Predefined list of fields to display
    predefined_fields = ["stageTimestamp", "stage", "requestURI", "verb",
                         "user", "sourceIPs", "userAgent", "objectRef",
                         "responseStatus", "annotations"]  # Replace with your desired field names
    # Keep only the predefined fields that actually exist in the data
    valid_fields = [
        field for field in predefined_fields if field in deduplicated_df.columns
    ]

    # Checkbox to toggle between the predefined fields and all fields
    show_all_fields = st.checkbox("Show all fields", value=False)
    if show_all_fields:
        filtered_df = deduplicated_df
    elif valid_fields:
        filtered_df = deduplicated_df[valid_fields]
    else:
        st.warning("None of the predefined fields exist in the dataset. Please check your field names.")
        return

    if filtered_df.empty:
        st.warning("There are no rows to display after deduplication.")
        return

    # Export the complete deduplicated data, not just the preview rows.
    st.download_button(
        label="Export Deduplicated Data to CSV",
        data=filtered_df.to_csv(index=False).encode("utf-8"),
        file_name="deduplicated_data.csv",
        mime="text/csv",
    )

    # Render only the first rows as HTML so large results stay responsive.
    preview_df = filtered_df.head()
    if len(preview_df) < len(filtered_df):
        st.caption(
            f"Showing the first {len(preview_df)} of {len(filtered_df)} rows; "
            "the export contains all rows."
        )
    st.markdown(style_dataframe_for_wrapping(preview_df), unsafe_allow_html=True)
# Script entry point: start the app only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    run()