Skip to content

Commit

Permalink
Added the possibility to exclude datasets that should not be indexed
Browse files Browse the repository at this point in the history
simeonwetzel committed Aug 8, 2024
1 parent d98a57d commit c992e3a
Showing 3 changed files with 27 additions and 8 deletions.
10 changes: 9 additions & 1 deletion search-app/server/config/config.json
Original file line number Diff line number Diff line change
@@ -3,9 +3,17 @@
"TAVILY_API_KEY": "",
"SDSA_API_KEY": "demo-api-key",
"pygeoapi_instances": [
{"url": "https://api.weather.gc.ca/",
"exclude_collections": [
"https://api.weather.gc.ca/collections/climate:dcs:historical:seasonal:absolute?lang=en",
"https://api.weather.gc.ca/collections/climate:dcs:projected:monthly:absolute?lang=en"
]
}
],
"web_geojson_resources": [
"https://webais.demo.52north.org/pygeoapi"
],
"local_files": [
"local_geojson_files": [
"./data/"
]
}
2 changes: 1 addition & 1 deletion search-app/server/connectors/geojson_osm.py
Original file line number Diff line number Diff line change
@@ -70,7 +70,7 @@ def __init__(self, file_dir: str = None, tag_name: str = "building"):
self.features = self._filter_meaningful_features(gj, tag_name)
else:
if not file_dir:
file_dir = config.local_files
file_dir = config.local_geojson_files
logging.info(f"Looking for files in following dir: {file_dir[0]}")
gj_files = []
for file in glob.glob(f"{file_dir[0]}*.geojson"):
23 changes: 17 additions & 6 deletions search-app/server/connectors/pygeoapi_retriever.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
from typing import List, Dict
from langchain.schema import Document
from config.config import Config
import re

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
@@ -15,7 +16,7 @@ def __init__(self, urls: List[str] = None):
if urls:
self.urls = urls
else:
self.urls = config.pygeoapi_instances
self.instances = config.pygeoapi_instances

async def _get_queryables(self, session: aiohttp.ClientSession, collection_id: str, base_url: str) -> dict:
"""
@@ -32,16 +33,26 @@ async def _get_queryables(self, session: aiohttp.ClientSession, collection_id: s
return queryables['properties']
return {}

async def _get_collections(self, base_url: str) -> List[dict]:
async def _get_collections(self, instance) -> List[dict]:
"""
Get all collections of a pygeoapi instance asynchronously.
"""
base_url = instance["url"]
logging.info(f"Fetching collections of pygeoapi instance: {base_url}")

exclude_urls = instance["exclude_collections"]
pattern = r'collections/([^?]+)'
exclude_collections = list(map(lambda url: re.search(pattern, url).group(1), exclude_urls))

logging.info(f"Excluding following collections from indexing operation: {exclude_collections}")


async with aiohttp.ClientSession() as session:
async with session.get(f'{base_url}/collections/') as response:
if response.status == 200:
collections = await response.json()
# exclude the collections listed under "exclude_collections"
collections['collections'] = [coll for coll in collections['collections'] if coll['id'] not in exclude_collections]
logging.debug(collections)

tasks = [
@@ -76,12 +87,12 @@ def _generate_docs(self, base_url:str, collections: List[dict]) -> List[Document
"extent": str(doc["extent"])}) for doc in collections]
return docs

async def get_collections_and_generate_docs(self, instance) -> List[Document]:
    """
    Fetch the collections of one pygeoapi instance and turn them into documents.

    `instance` is one entry of the configured pygeoapi instances; its "url"
    value is used as the base URL when generating the documents.
    Returns the list of documents built by `_generate_docs`.
    """
    collections = await self._get_collections(instance)
    return self._generate_docs(instance["url"], collections)

async def get_docs_for_all_instances(self) -> List[Document]:
    """
    Generate documents for every configured pygeoapi instance.

    All instances in `self.instances` are queried concurrently. The
    per-instance document lists are concatenated in the order of
    `self.instances`; an empty list is returned when no instance is configured.
    """
    tasks = [self.get_collections_and_generate_docs(instance) for instance in self.instances]
    docs_per_instance = await asyncio.gather(*tasks)

    # Merge the results so that documents of all instances are returned,
    # not only those of the first one.
    all_docs = []
    for docs in docs_per_instance:
        all_docs.extend(docs)
    return all_docs

0 comments on commit c992e3a

Please sign in to comment.