# wikidata.py
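"""Streamlit app that cleans a CSV of legal terms and maps each term to a
Wikidata entity using the wbsearchentities search API."""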
import re

import pandas as pd
import requests
import streamlit as st


def clean_text(text):
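    """Normalize a raw term: strip list markers, bracketed asides, and
    special characters, then collapse whitespace.

    Illustrative examples:
        "1. Habeas Corpus (Latin)" -> "Habeas Corpus"
        "- due-process"            -> "due-process"
    """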
    if pd.isna(text):
        return ""

    # Convert to string if not already
    text = str(text)

    # Remove common list markers (numbers, letters, bullets)
    text = re.sub(r'^\s*[\(\[\{]?[0-9a-zA-Z][\)\]\}]?\s*[\.,-]\s*', '', text)
    text = re.sub(r'^\s*[\-\*\•\◦\○\⚬]+\s*', '', text)

    # Remove any remaining parentheses, brackets, and their contents
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'\{[^}]*\}', '', text)

    # Remove special characters but keep hyphens within words
    text = re.sub(r'[^a-zA-Z0-9\s\-]', ' ', text)

    # Collapse runs of hyphens, then drop hyphens at the start or end
    text = re.sub(r'-+', '-', text)
    text = re.sub(r'^-+|-+$', '', text)

    # Collapse multiple spaces and strip surrounding whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def clean_dataframe(df):
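    """Return a copy of df with the 'term' column cleaned, empty terms
    dropped, and duplicate terms removed."""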
    # Work on a copy to avoid modifying the caller's DataFrame
    df_cleaned = df.copy()

    # Clean the 'term' column; the row filtering below only makes sense
    # when that column exists, so keep it inside the guard
    if 'term' in df_cleaned.columns:
        df_cleaned['term'] = df_cleaned['term'].apply(clean_text)

        # Remove rows whose term became empty, then deduplicate
        df_cleaned = df_cleaned[df_cleaned['term'].str.len() > 0]
        df_cleaned = df_cleaned.drop_duplicates(subset=['term'])

    return df_cleaned


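# Cache Wikidata responses across Streamlit reruns, so widget interaction
# doesn't re-query the API for every term (assumes a Streamlit version that
# provides st.cache_data, i.e. >= 1.18).
@st.cache_data(show_spinner=False)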
def search_wikidata(query, limit=5):
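    """Search Wikidata's wbsearchentities endpoint for `query` and return
    up to `limit` matches as dicts with 'id', 'label', and 'description'."""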
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "type": "item",
        "limit": limit,
        "search": query,
    }
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()

    results = []
    for result in data.get("search", []):
        results.append({
            "id": result["id"],
            "label": result.get("label", "No label"),
            "description": result.get("description", "No description available"),
        })
    return results


def main():
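    """Streamlit entry point: upload a CSV, clean it, map each term to a
    Wikidata entity, and offer the result as a download."""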
    st.title("Legal Concepts Wikidata Mapper")
    st.markdown("""
## Welcome to the Legal Concepts Wikidata Mapper!

This app helps you map legal terms and concepts to their corresponding Wikidata entities.
It includes automatic data cleaning to handle messy input data!

### Data Cleaning Features:
- Removes list markers (numbers, bullets, etc.)
- Strips unnecessary punctuation and special characters
- Removes empty rows and duplicates
- Cleans up extra spaces and formatting
- Preserves hyphens within words

### How it works:
1. **Upload a CSV file**: Your file should contain a column named 'term' with the legal terms you want to map.
2. **Data Cleaning**: The app automatically cleans your data.
3. **Review Changes**: You can see both original and cleaned data.
4. **Search Wikidata**: For each cleaned term, the app searches Wikidata.
5. **Select matches**: Choose the most appropriate Wikidata entity for each term.
6. **Download results**: Get a new CSV file with the Wikidata IDs added.
""")
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is not None:
        # Read original data
        df_original = pd.read_csv(uploaded_file)

        if 'term' not in df_original.columns:
            st.error("The CSV file must contain a 'term' column. Please check your file and try again.")
            return

        # Clean the data
        df_cleaned = clean_dataframe(df_original)

        # Show the data cleaning results side by side
        st.markdown("### Data Cleaning Results")
        col1, col2 = st.columns(2)
        with col1:
            st.write("Original Data:")
            st.write(df_original[['term']].head())
        with col2:
            st.write("Cleaned Data:")
            st.write(df_cleaned[['term']].head())

        st.write(f"Rows before cleaning: {len(df_original)}")
        st.write(f"Rows after cleaning: {len(df_cleaned)}")
        if len(df_original) != len(df_cleaned):
            st.info(f"Removed {len(df_original) - len(df_cleaned)} duplicate or empty rows during cleaning.")
        # st.button is True only on the rerun where it is clicked, and every
        # later widget interaction triggers a new rerun; persist the click in
        # session_state so the mapping UI doesn't disappear.
        if st.button("Proceed with Cleaned Data"):
            st.session_state['proceed'] = True

        if st.session_state.get('proceed'):
            st.markdown("### Mapping Process")
            st.write("For each cleaned term, we'll search Wikidata and ask you to select the best match.")

            results = {}
            for term in df_cleaned['term']:
                with st.expander(f"Map '{term}'"):
                    st.write(f"Searching Wikidata for '{term}'...")
                    wikidata_results = search_wikidata(term)
                    if wikidata_results:
                        options = [f"{r['id']} - {r['label']} ({r['description']})" for r in wikidata_results]
                        selected = st.selectbox(
                            f"Select the appropriate Wikidata entity for '{term}':",
                            options + ['None of the above'],
                        )
                        if selected != 'None of the above':
                            # The entity ID is the part before the first ' - '
                            results[term] = selected.split(' - ')[0]
                            st.success(f"Mapped '{term}' to Wikidata entity {results[term]}")
                        else:
                            results[term] = ''
                            st.info("Term will be left unmapped")
                    else:
                        st.write("No results found. Term will be left unmapped.")
                        results[term] = ''
            df_cleaned['wikidata_id'] = df_cleaned['term'].map(results)

            st.markdown("### Final Results")
            st.write(df_cleaned)

            csv = df_cleaned.to_csv(index=False)
            st.download_button(
                label="Download mapped CSV",
                data=csv,
                file_name="mapped_legal_terms.csv",
                mime="text/csv",
            )

            st.markdown("""
### Next Steps
Your cleaned and mapped data is ready! You can now:
1. Use these Wikidata IDs in your systems or documents
2. Explore the Wikidata pages for additional information
3. Use this mapping for data integration projects
""")


if __name__ == "__main__":
    main()
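
# To run locally (requires the streamlit, pandas, and requests packages):
#   streamlit run wikidata.py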