Skip to content

Commit

Permalink
[Place Page SEO] Add new sitemap for testing (#3898)
Browse files Browse the repository at this point in the history
Add a new sitemap consisting of 50 US States, Washington DC, and the top
100 US cities by population. This sitemap can be used as a "test set"
for our SEO changes, and allows us to more easily request
re-indexing and track indexing over time for just this priority set of
places.

This PR contains the new sitemap, an updated robots.txt, and the
code used to generate the sitemap.
  • Loading branch information
juliawu committed Jan 25, 2024
1 parent 4a2ed43 commit 48fb61a
Show file tree
Hide file tree
Showing 4 changed files with 200 additions and 0 deletions.
1 change: 1 addition & 0 deletions static/robots.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Sitemap: https://datacommons.org/sitemap/CensusCoreBasedStatisticalArea.0.txt
Sitemap: https://datacommons.org/sitemap/StateComponent.0.txt
Sitemap: https://datacommons.org/sitemap/AdministrativeArea2.0.txt
Sitemap: https://datacommons.org/sitemap/AdministrativeArea2.1.txt
Sitemap: https://datacommons.org/sitemap/PriorityPlaces.0.txt
Sitemap: https://datacommons.org/sitemap/AdministrativeArea4.3.txt
Sitemap: https://datacommons.org/sitemap/AdministrativeArea4.2.txt
Sitemap: https://datacommons.org/sitemap/Continent.0.txt
Expand Down
151 changes: 151 additions & 0 deletions static/sitemap/PriorityPlaces.0.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
https://datacommons.org/place/geoId/01
https://datacommons.org/place/geoId/02
https://datacommons.org/place/geoId/04
https://datacommons.org/place/geoId/05
https://datacommons.org/place/geoId/06
https://datacommons.org/place/geoId/08
https://datacommons.org/place/geoId/09
https://datacommons.org/place/geoId/10
https://datacommons.org/place/geoId/11
https://datacommons.org/place/geoId/12
https://datacommons.org/place/geoId/13
https://datacommons.org/place/geoId/15
https://datacommons.org/place/geoId/16
https://datacommons.org/place/geoId/17
https://datacommons.org/place/geoId/18
https://datacommons.org/place/geoId/19
https://datacommons.org/place/geoId/20
https://datacommons.org/place/geoId/21
https://datacommons.org/place/geoId/22
https://datacommons.org/place/geoId/23
https://datacommons.org/place/geoId/24
https://datacommons.org/place/geoId/25
https://datacommons.org/place/geoId/26
https://datacommons.org/place/geoId/27
https://datacommons.org/place/geoId/28
https://datacommons.org/place/geoId/29
https://datacommons.org/place/geoId/30
https://datacommons.org/place/geoId/31
https://datacommons.org/place/geoId/32
https://datacommons.org/place/geoId/33
https://datacommons.org/place/geoId/34
https://datacommons.org/place/geoId/35
https://datacommons.org/place/geoId/36
https://datacommons.org/place/geoId/37
https://datacommons.org/place/geoId/38
https://datacommons.org/place/geoId/39
https://datacommons.org/place/geoId/40
https://datacommons.org/place/geoId/41
https://datacommons.org/place/geoId/42
https://datacommons.org/place/geoId/44
https://datacommons.org/place/geoId/45
https://datacommons.org/place/geoId/46
https://datacommons.org/place/geoId/47
https://datacommons.org/place/geoId/48
https://datacommons.org/place/geoId/49
https://datacommons.org/place/geoId/50
https://datacommons.org/place/geoId/51
https://datacommons.org/place/geoId/53
https://datacommons.org/place/geoId/54
https://datacommons.org/place/geoId/55
https://datacommons.org/place/geoId/56
https://datacommons.org/place/geoId/3651000
https://datacommons.org/place/geoId/0644000
https://datacommons.org/place/geoId/1714000
https://datacommons.org/place/geoId/4835000
https://datacommons.org/place/geoId/0455000
https://datacommons.org/place/geoId/4260000
https://datacommons.org/place/geoId/4865000
https://datacommons.org/place/geoId/0666000
https://datacommons.org/place/geoId/4819000
https://datacommons.org/place/geoId/0668000
https://datacommons.org/place/geoId/4805000
https://datacommons.org/place/geoId/4827000
https://datacommons.org/place/geoId/1235000
https://datacommons.org/place/geoId/3918000
https://datacommons.org/place/geoId/3712000
https://datacommons.org/place/geoId/0667000
https://datacommons.org/place/geoId/1836003
https://datacommons.org/place/geoId/3634000
https://datacommons.org/place/geoId/5363000
https://datacommons.org/place/geoId/0820000
https://datacommons.org/place/geoId/1150000
https://datacommons.org/place/geoId/2507000
https://datacommons.org/place/geoId/4752000
https://datacommons.org/place/geoId/4824000
https://datacommons.org/place/geoId/2622000
https://datacommons.org/place/geoId/4752006
https://datacommons.org/place/geoId/4055000
https://datacommons.org/place/geoId/4159000
https://datacommons.org/place/geoId/3240000
https://datacommons.org/place/geoId/4748000
https://datacommons.org/place/geoId/2148006
https://datacommons.org/place/geoId/2404000
https://datacommons.org/place/geoId/5553000
https://datacommons.org/place/geoId/3502000
https://datacommons.org/place/geoId/0477000
https://datacommons.org/place/geoId/0627000
https://datacommons.org/place/geoId/0446000
https://datacommons.org/place/geoId/0664000
https://datacommons.org/place/geoId/1304000
https://datacommons.org/place/geoId/2938000
https://datacommons.org/place/geoId/3610000
https://datacommons.org/place/geoId/0816000
https://datacommons.org/place/geoId/3137000
https://datacommons.org/place/geoId/3755000
https://datacommons.org/place/geoId/1245000
https://datacommons.org/place/geoId/0643000
https://datacommons.org/place/geoId/5182000
https://datacommons.org/place/geoId/0653000
https://datacommons.org/place/geoId/2743000
https://datacommons.org/place/geoId/4075000
https://datacommons.org/place/geoId/1271000
https://datacommons.org/place/geoId/4804000
https://datacommons.org/place/geoId/2255000
https://datacommons.org/place/geoId/2079000
https://datacommons.org/place/geoId/0603526
https://datacommons.org/place/geoId/3916000
https://datacommons.org/place/geoId/0804000
https://datacommons.org/place/geoId/1517000
https://datacommons.org/place/geoId/0602000
https://datacommons.org/place/geoId/1571550
https://datacommons.org/place/geoId/0669000
https://datacommons.org/place/geoId/0662000
https://datacommons.org/place/geoId/3638000
https://datacommons.org/place/geoId/4817000
https://datacommons.org/place/geoId/2146027
https://datacommons.org/place/geoId/3231900
https://datacommons.org/place/geoId/0675000
https://datacommons.org/place/geoId/7276770
https://datacommons.org/place/geoId/2758000
https://datacommons.org/place/geoId/3915000
https://datacommons.org/place/geoId/2965000
https://datacommons.org/place/geoId/4261000
https://datacommons.org/place/geoId/3656000
https://datacommons.org/place/geoId/3728000
https://datacommons.org/place/geoId/3128000
https://datacommons.org/place/geoId/0203000
https://datacommons.org/place/geoId/1253000
https://datacommons.org/place/geoId/0636770
https://datacommons.org/place/geoId/4858016
https://datacommons.org/place/geoId/3451000
https://datacommons.org/place/geoId/3719000
https://datacommons.org/place/geoId/0613392
https://datacommons.org/place/geoId/3977000
https://datacommons.org/place/geoId/1263000
https://datacommons.org/place/geoId/4841464
https://datacommons.org/place/geoId/1825000
https://datacommons.org/place/geoId/3436000
https://datacommons.org/place/geoId/0412000
https://datacommons.org/place/geoId/5548000
https://datacommons.org/place/geoId/4845000
https://datacommons.org/place/geoId/0465000
https://datacommons.org/place/geoId/3260600
https://datacommons.org/place/geoId/3611000
https://datacommons.org/place/geoId/0427400
https://datacommons.org/place/geoId/0427820
https://datacommons.org/place/geoId/3251800
https://datacommons.org/place/geoId/3775000
https://datacommons.org/place/geoId/2148000
https://datacommons.org/place/geoId/5116000
https://datacommons.org/place/geoId/5157000
47 changes: 47 additions & 0 deletions tools/sitemap/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import time

import datacommons as dc
import requests

logging.getLogger().setLevel(logging.INFO)

Expand Down Expand Up @@ -87,6 +88,51 @@ def write_place_url(place_type):
time.sleep(10)


def write_priority_places_sitemap():
    """Write a custom sitemap for SEO testing.

    Writes a sitemap with 50 US states, Washington D.C., and the top 100
    US cities by population. The output is one URL per line
    (SITE_PREFIX + dcid) in "PriorityPlaces.0.txt" under SAVE_PATH.

    If the state query or the city ranking request fails, or no places are
    found at all, the problem is logged and no file is written, so an
    existing sitemap is never replaced by a partial or empty one.
    """
    # Get US states and Washington DC
    sparql = '''
    SELECT ?dcid
    WHERE {
      ?a typeOf State .
      ?a dcid ?dcid
    }
    Order By ASC(?dcid)
    LIMIT 51
    '''
    dcids = []
    try:
        state_data = dc.query(sparql)
        for state in state_data:
            state_dcid = state.get('?dcid', None)
            if state_dcid:
                dcids.append(state_dcid)
    except Exception:
        logging.exception('Got an error while querying for US states')
        return

    # Get Top 100 US cities by population from ranking API
    ranking_url = (
        "https://datacommons.org/api/ranking/Count_Person/City/country/USA")
    try:
        # A timeout is required: without one, requests waits forever on a
        # stalled connection and the whole sitemap job hangs.
        response = requests.get(ranking_url, timeout=60)
        # Fail loudly on 4xx/5xx instead of parsing an error page as data.
        response.raise_for_status()
        city_ranking_data = response.json().get("Count_Person",
                                                {}).get("rankTop1000",
                                                        {}).get("info", [])
    except (requests.RequestException, ValueError):
        # ValueError covers a body that is not valid JSON.
        logging.exception('Got an error while fetching top US cities')
        return

    for city in city_ranking_data[:100]:
        city_dcid = city.get("placeDcid", None)
        if city_dcid:
            dcids.append(city_dcid)

    if not dcids:
        logging.error('No priority places found; sitemap was not written')
        return

    # Write to file
    sitemap_location = os.path.join(SAVE_PATH, "PriorityPlaces.0.txt")
    with open(sitemap_location, "w", encoding="utf-8") as f:
        f.writelines(SITE_PREFIX + place_dcid + '\n' for place_dcid in dcids)


def updateRobotTxt():
with open('../../static/robots.txt', 'w') as robot:
for f in os.listdir(SAVE_PATH):
Expand All @@ -97,6 +143,7 @@ def main():
dc.set_api_key('noop')
for place_type in PLACES:
write_place_url(place_type)
write_priority_places_sitemap()
updateRobotTxt()


Expand Down
1 change: 1 addition & 0 deletions tools/sitemap/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
datacommons
requests
yapf

0 comments on commit 48fb61a

Please sign in to comment.