diff --git a/.gitignore b/.gitignore index a6369733..0026d890 100644 --- a/.gitignore +++ b/.gitignore @@ -118,4 +118,9 @@ examples/energy_wizard/pdfs/ examples/energy_wizard/embed/ examples/energy_wizard/txt/ examples/energy_wizard/meta.csv +examples/research_hub/pdfs/ +examples/research_hub/embed/ +examples/research_hub/txt/ +examples/research_hub/meta.csv *ignore*.py + diff --git a/elm/__init__.py b/elm/__init__.py index 366c3684..b35e3a03 100644 --- a/elm/__init__.py +++ b/elm/__init__.py @@ -11,7 +11,7 @@ from elm.summary import Summary from elm.tree import DecisionTree from elm.wizard import EnergyWizard -from elm.osti import OstiRecord, OstiList +from elm.web.osti import OstiRecord, OstiList __author__ = """Grant Buster""" __email__ = "Grant.Buster@nrel.gov" diff --git a/elm/osti.py b/elm/web/osti.py similarity index 100% rename from elm/osti.py rename to elm/web/osti.py diff --git a/elm/web/rhub.py b/elm/web/rhub.py new file mode 100644 index 00000000..7efd89fc --- /dev/null +++ b/elm/web/rhub.py @@ -0,0 +1,629 @@ +""" +Code to build Corpus from the researcher hub. +""" +import os +import os.path +import logging +from urllib.request import urlopen +import requests +import pandas as pd +from bs4 import BeautifulSoup + +logger = logging.getLogger(__name__) + + +class ResearchOutputs(): + """Class to handle publications portion of the NREL researcher hub.""" + BASE_URL = "https://research-hub.nrel.gov/en/publications/?page=0" + + def __init__(self, url, n_pages=1, txt_dir='./ew_txt'): + """ + Parameters + ---------- + url : str + Research hub publications URL, most likely + https://research-hub.nrel.gov/en/publications/ + n_pages : int + Number of pages to get from the API. Typical response has 50 + entries per page. Default of 1 ensures that this class doesnt hang + on a million responses. + txt_dir : str + File directory where you would like to save output .txt files. + """ + + self.text_dir = txt_dir + self.all_links = [] + for p in range(0, n_pages): + url = url + f"?page={p}" + html = self.html_response(url) + self.soup = BeautifulSoup(html, "html.parser") + + self.target = self.soup.find('ul', {'class': 'list-results'}) + self.docs = self.target.find_all('a', {'class': 'link'}) + + page_links = [d['href'] for d in self.docs if + '/publications/' in d['href']] + self.all_links.extend(page_links) + + def html_response(self, url): + """Function to retrieve html response. + + Parameters + ---------- + url : str + URL of interest. + + Returns + ------- + html : str + HTML response output. + """ + with urlopen(url) as page: + html = page.read().decode("utf-8") + + return html + + def _scrape_authors(self, soup_inst): + """Scrape the names of authors associated with given publication. + + Parameters + ---------- + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance for the url associated with a + given publication. + + Returns + ------- + authors : list + List of all authors (strings) that contributed to publication. + """ + + authors = soup_inst.find('p', {'class': 'relations persons'}).text + + return authors + + def _scrape_links(self, soup_inst): + """Scrape the links under 'Access to Document' header + for a publication. + + Parameters + ---------- + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance for the url associated with a + given publication. + + Returns + ------- + doi link : str + DOI link for a reference if it exists. 
+ pdf link : str + PDF link for a reference if it exists + """ + + doi_target = soup_inst.find('ul', {'class': 'dois'}) + if doi_target: + doi = doi_target.find('a')['href'] + else: + doi = '' + + pdf_target = soup_inst.find('ul', {'class': 'links'}) + if pdf_target: + pdf = pdf_target.find('a')['href'] + else: + pdf = '' + + return doi, pdf + + def _scrape_category(self, soup_inst): + """Scrape the category (ex: Technical Report, Journal Article, etc) + for a given publication. + + Parameters + ---------- + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance for the url associated with a + given publication. + + Returns + ------- + category : str + Publication category for a given record. + """ + + try: + category = soup_inst.find('span', + {'class': + 'type_classification'}).text + except AttributeError: + category = soup_inst.find('span', + {'class': + 'type_classification_parent'}).text + + return category + + def _scrape_year(self, soup_inst): + """Scrape publication year for a given publication. + + Parameters + ---------- + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance for the url associated with a + given publication. + + Returns + ------- + year : str + The year a record was published. + """ + year = soup_inst.find('span', {'class': 'date'}).text + + return year + + def _scrape_id(self, soup_inst): + """Scrape the NREL Publication Number for a given publication. + + Parameters + ---------- + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance for the url associated with a + given publication. + + Returns + ------- + NREL Publication Number: str + Publication number for a record, unique identifier. + """ + + nrel_id = soup_inst.find('ul', {'class': 'relations keywords'}).text + + return nrel_id + + def build_meta(self): + """Build a meta dataframe containing relevant information + for publications. + + Returns + ------- + publications_meta : pd.DataFrame + Dataframe containing metadata for publications. + """ + publications_meta = pd.DataFrame(columns=('title', 'nrel_id', + 'authors', 'year', + 'url', 'doi', + 'pdf_url', 'category')) + for link in self.all_links[:20]: # quantity control here # + with urlopen(link) as page: + html = page.read().decode("utf-8") + meta_soup = BeautifulSoup(html, "html.parser") + + title = meta_soup.find('h1').text + nrel_id = self._scrape_id(meta_soup) + authors = self._scrape_authors(meta_soup) + doi = self._scrape_links(meta_soup)[0] + pdf_url = self._scrape_links(meta_soup)[1] + category = self._scrape_category(meta_soup) + year = self._scrape_year(meta_soup) + + new_row = {'title': title, + 'nrel_id': nrel_id, + 'year': year, + 'authors': authors, + 'url': link, + 'doi': doi, + 'pdf_url': pdf_url, + 'category': category + } + + publications_meta.loc[len(publications_meta)] = new_row + + return publications_meta + + def download_pdf(self, pdf_dir, txt_dir, soup_inst): + """Downloads a pdf for a given link + + Parameters + ---------- + out_dir: str + Directory where the .pdf files should be saved. + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance used to locate pdf url. 
+ """ + pdf_target = soup_inst.find('ul', {'class': 'links'}) + if pdf_target: + pdf_url = pdf_target.find('a')['href'] + + fn = os.path.basename(pdf_url) + fp_out = os.path.join(pdf_dir, fn) + + if pdf_url and pdf_url.endswith('.pdf'): + if not os.path.exists(fp_out): + session = requests.Session() + response = session.get(pdf_url) + with open(fp_out, 'wb') as f_pdf: + f_pdf.write(response.content) + logger.info('Downloaded {}'.format(fn)) + else: + logger.info('{} has already been downloaded'.format(fn)) + elif not pdf_url.endswith('.pdf'): + parent_url = soup_inst.find(property="og:url")['content'] + fn = os.path.basename(parent_url) + '_abstract.txt' + logger.info('No PDF file for {}. Processing abstract.'.format(fn)) + self.scrape_abstract(txt_dir, fn, soup_inst) + + def scrape_abstract(self, out_dir, fn, soup_inst): + """Scrapes abstract for a provided publication + + Parameters + ---------- + out_dir: str + Directory where the .txt files should be saved. + fn: str + File name for saving the file. + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance used for scraping. + """ + out_fp = os.path.join(out_dir, fn) + if not os.path.exists(out_fp): + title = soup_inst.find('h1').text + target = soup_inst.find('h2', string='Abstract') + if target: + abstract = target.find_next_siblings()[0].text + full_txt = (f'The report titled {title} can be ' + f'summarized as follows: {abstract}') + with open(out_fp, "w") as text_file: + text_file.write(full_txt) + else: + logger.info('Abstract not found for {}'.format(fn)) + else: + logger.info('{} has already been processed.'.format(out_fp)) + + def scrape_publications(self, pdf_dir, txt_dir): + """Downloads pdfs for all Technical Reports and scrapes abstracts + for all other publications listed. + + Parameters + ---------- + pdf_dir: str + Directory where the .pdf files should be saved. + txt_dir: str + Directory where the .txt files should be saved. + """ + + os.makedirs(pdf_dir, exist_ok=True) + os.makedirs(txt_dir, exist_ok=True) + url_list = self.all_links[:20] # quantity control here # + + for pub in url_list: + with urlopen(pub) as page: + html = page.read().decode("utf-8") + pubs_soup = BeautifulSoup(html, "html.parser") + + category = self._scrape_category(pubs_soup) + + if category == 'Technical Report': + self.download_pdf(pdf_dir, txt_dir, pubs_soup) + else: + fn = os.path.basename(pub) + '_abstract.txt' + self.scrape_abstract(txt_dir, fn, pubs_soup) + + return logger.info('Finished processing publications') + + +class ResearcherProfiles(): + """ + Class to handle researcher profiles portion of the NREL researcher hub. + """ + BASE_URL = "https://research-hub.nrel.gov/en/persons/?page=0" + + def __init__(self, url, n_pages=1, txt_dir='./ew_txt'): + """ + Parameters + ---------- + url : str + Research hub profiles URL, most likely + https://research-hub.nrel.gov/en/persons/ + n_pages : int + Number of pages to get from the API. Typical response has 50 + entries per page. Default of 1 ensures that this class doesnt hang + on a million responses. + txt_dir : str + File directory where you would like to save output .txt files. 
+ """ + + self.text_dir = txt_dir + self.profile_links = [] + for p in range(0, n_pages): + url_base = url + f"?page={p}" + with urlopen(url_base) as page: + html = page.read().decode("utf-8") + soup = BeautifulSoup(html, "html.parser") + + target = soup.find('ul', {'class': 'grid-results'}) + docs = target.find_all('a', {'class': 'link'}) + + page_links = [d['href'] for d in docs if '/persons/' in d['href']] + self.profile_links.extend(page_links) + + def build_meta(self): + """Build a meta dataframe containing relevant information for + researchers. + + Returns + ------- + profiles_meta : pd.DataFrame + Dataframe containing metadata for researcher profiles. + """ + url_list = self.profile_links + profiles_meta = pd.DataFrame(columns=('title', 'nrel_id', + 'email', 'url', 'fn', + 'category' + )) + for link in url_list[:20]: # quantity control here # + with urlopen(link) as page: + html = page.read().decode("utf-8") + meta_soup = BeautifulSoup(html, "html.parser") + + title = meta_soup.find('h1').text + email_target = meta_soup.find('a', {'class': 'email'}) + if email_target: + email = meta_soup.find('a', + {'class': 'email'} + ).text.replace('nrelgov', '@nrel.gov') + else: + email = '' + id = os.path.basename(link) + fn = os.path.basename(link) + '.txt' + + new_row = {'title': title, + 'nrel_id': id, + 'email': email, + 'url': link, + 'fn': fn, + 'category': 'Researcher Profile' + } + + profiles_meta.loc[len(profiles_meta)] = new_row + + return profiles_meta + + def _scrape_title(self, soup_inst): + """Scrapes name and position for each researcher. + + Parameters + ---------- + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance for the url associated with a + given researcher. + + Returns + ------- + intro : str + String containing researchers name and position. + """ + + r = soup_inst.find('h1').text + + if soup_inst.find('span', {'class': 'job-title'}): + j = soup_inst.find('span', {'class': 'job-title'}).text + intro = (f'The following is brief biography for {r} ' + f'who is a {j} at the National Renewable Energy ' + f'Laboratory:\n') + else: + intro = (f'The following is brief biography for {r}' + f'who works for the National Renewable Energy ' + f'Laboratory:\n') + + return intro + + def _scrape_bio(self, soup_inst): + """Scrapes 'Personal Profile' section for each researcher. + + Parameters + ---------- + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance for the url associated with a + given researcher. + + Returns + ------- + bio : str + String containing background text from profile. + """ + target = soup_inst.find('h3', string="Personal Profile") + + bio = '' + if target: + for sib in target.find_next_siblings(): + if sib.name == "h3": + break + bio = bio + sib.text + + return bio + + def _scrape_lists(self, soup_inst, heading): + """Scrapes sections such as 'Professional Experience' and + 'Research Interests' + + Parameters + ---------- + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance for the url associated with a + given researcher. + heading: str + Section to scrape. Should be 'Professional Experience' or + 'Research Interests' + + Returns + ------- + text : str + String containing contents from the experience section. 
+ """ + r = soup_inst.find('h1').text + target = soup_inst.find('h3', string=heading) + + exp_list = [] + + if target: + for sib in target.find_next_siblings(): + exp_list.append(sib.text) + + exp = ', '.join(exp_list) + + text = f"{r}'s {heading} includes the following:\n{exp} " + else: + text = '' + + return text + + def _scrape_education(self, soup_inst): + """Scrapes and reformats 'Education/Academic Qualification' + section for each researcher. + + Parameters + ---------- + soup_inst : bs4.BeautifulSoup + Instantiated beautiful soup instance for the url associated with a + given researcher. + + Returns + ------- + full_text : str + String containing researcher's education (level, focus, + and institution). + """ + r = soup_inst.find('h1').text + target = soup_inst.find('h3', + string='Education/Academic Qualification') + + full_text = '' + if target: + for sib in target.find_next_siblings(): + t = sib.text + if len(t.split(',')) >= 3: + level = t.split(',')[0] + deg = t.split(',')[1] + inst = ','.join(t.split(',')[2:]) + + text = (f"{r} received a {level} degree in {deg} " + f"from the {inst}. ") + elif len(t.split(',')) == 2: + level = t.split(',')[0] + inst = t.split(',')[1] + + text = f"{r} received a {level} degree from the {inst}. " + + full_text = full_text + text + + return full_text + + def _scrape_publications(self, profile_url): + """Scrapes the name of each publication that a + researcher contributed to. + + Parameters + ---------- + profile_url : str + Link to a specific researchers profile. + + Returns + ------- + text : str + String containing names of all publications for a given researcher. + """ + pubs_url = profile_url + '/publications/' + with urlopen(pubs_url) as page: + html = page.read().decode("utf-8") + pubs_soup = BeautifulSoup(html, "html.parser") + + r = pubs_soup.find('h1').text + target = pubs_soup.find_all('h3', {'class': 'title'}) + + pubs = [] + if target: + for p in target: + pubs.append(p.text) + + pubs = ', '.join(pubs) + text = (f'{r} has contributed to the following ' + f'publications: {pubs}.') + else: + text = '' + + return text + + def _scrape_similar(self, profile_url): + """Scrapes the names listed under the 'Similar Profiles' section. + + Parameters + ---------- + profile_url : str + Link to a specific researchers profile. + + Returns + ------- + text : str + String containing names of similar researchers. + """ + sim_url = profile_url + '/similar/' + with urlopen(sim_url) as sim_page: + sim_html = sim_page.read().decode("utf-8") + sim_soup = BeautifulSoup(sim_html, "html.parser") + + r = sim_soup.find('h1').text + target = sim_soup.find_all('h3', {'class': 'title'}) + + similar = [] + if target: + for p in target: + similar.append(p.text) + + similar = ', '.join(similar) + text = f'{r} has worked on projects with {similar}.' + else: + text = '' + + return text + + def scrape_profiles(self, out_dir): + """Scrapes profiles for each researcher. + + Parameters + ---------- + out_dir: str + Directory where the .txt files should be saved. 
+ """ + os.makedirs(out_dir, exist_ok=True) + url_list = self.profile_links[:20] # quantity control here # + + for i, prof in enumerate(url_list): + f = os.path.basename(prof) + '.txt' + txt_fp = os.path.join(out_dir, f) + if not os.path.exists(txt_fp): + with urlopen(prof) as page: + html = page.read().decode("utf-8") + prof_soup = BeautifulSoup(html, "html.parser") + + r = prof_soup.find('h1').text + + intro = self._scrape_title(prof_soup) + bio = self._scrape_bio(prof_soup) + exp = self._scrape_lists(prof_soup, 'Professional Experience') + interests = self._scrape_lists(prof_soup, 'Research Interests') + edu = self._scrape_education(prof_soup) + pubs = self._scrape_publications(prof) + similar = self._scrape_similar(prof) + + full_txt = (intro + bio + '\n' + exp + '\n' + + interests + '\n' + edu + '\n' + + pubs + '\n' + similar) + + with open(txt_fp, "w") as text_file: + text_file.write(full_txt) + logger.info('Profile {}/{}: {} saved to ' + '{}'.format(i + 1, len(url_list), + r, txt_fp)) + + else: + logger.info('Profile {}/{} already ' + 'exists.'.format(i + 1, len(url_list))) + return logger.info('Finished processing profiles') diff --git a/elm/wizard.py b/elm/wizard.py index 0431005c..c684f857 100644 --- a/elm/wizard.py +++ b/elm/wizard.py @@ -165,6 +165,7 @@ def engineer_query(self, query, token_budget=None, new_info_threshold=0.7, # [1:] to not include the system role in the semantic search query = [f"{msg['role'].upper()}: {msg['content']}" for msg in self.messages[1:]] + query = '\n\n'.join(query) token_budget = token_budget or self.token_budget diff --git a/examples/research_hub/README.rst b/examples/research_hub/README.rst new file mode 100644 index 00000000..4f430ea0 --- /dev/null +++ b/examples/research_hub/README.rst @@ -0,0 +1,28 @@ +******************************** +The Energy Wizard - Research Hub +******************************** + +This example demonstrates how to scrape publication abstracts and researcher profiles, +chunk, embed, and then run a streamlit app that interfaces an LLM with the text +corpus. It is intended for use with the [NREL Research Hub](https://research-hub.nrel.gov/) only. + +Notes: + +- Currently this example is only set up to include 10 researchers and 10 publications + +- Streamlit is required to run this app, which is not an explicit requirement of this repo (``pip install streamlit``) + +- You need to set up your own OpenAI or Azure-OpenAI API keys to run the scripts. + +Scraping and Embedding +============================== + +Run ``python ./retrieve_docs.py``to scrape research-hub.nrel.gov for both profiles and publications. The script then runs the +text through the OpenAI embedding model. + +Running the Streamlit App +========================= + +Run ``streamlit run ./run_app.py`` to start the streamlit app. You can now chat +with the Energy Wizard, which will interface with the downloaded text corpus to +answer your questions. diff --git a/examples/research_hub/retrieve_docs.py b/examples/research_hub/retrieve_docs.py new file mode 100644 index 00000000..d48d459d --- /dev/null +++ b/examples/research_hub/retrieve_docs.py @@ -0,0 +1,133 @@ +""" +Code to build Corpus from the researcher hub. 
+""" +import os +import os.path +import asyncio +from glob import glob +import logging +import time +import pandas as pd +import openai +from rex import init_logger + + +from elm.pdf import PDFtoTXT +from elm.embed import ChunkAndEmbed +from elm.web.rhub import ResearcherProfiles +from elm.web.rhub import ResearchOutputs + +# initialize logger +logger = logging.getLogger(__name__) +init_logger(__name__, log_level='DEBUG') +init_logger('elm', log_level='INFO') + +# set openAI variables +openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT") +openai.api_key = os.getenv("AZURE_OPENAI_API_KEY") +openai.api_type = 'azure' +openai.api_version = '2023-03-15-preview' + +ChunkAndEmbed.EMBEDDING_MODEL = 'text-embedding-ada-002-2' +ChunkAndEmbed.EMBEDDING_URL = ('https://stratus-embeddings-south-central.' + 'openai.azure.com/openai/deployments/' + 'text-embedding-ada-002-2/embeddings?' + f'api-version={openai.api_version}') +ChunkAndEmbed.HEADERS = {"Content-Type": "application/json", + "Authorization": f"Bearer {openai.api_key}", + "api-key": f"{openai.api_key}"} + +PDF_DIR = './pdfs/' +TXT_DIR = './txt/' +EMBED_DIR = './embed/' + +if __name__ == '__main__': + os.makedirs(PDF_DIR, exist_ok=True) + os.makedirs(TXT_DIR, exist_ok=True) + os.makedirs(EMBED_DIR, exist_ok=True) + + rp = ResearcherProfiles('https://research-hub.nrel.gov/en/persons/') + pubs = ResearchOutputs('https://research-hub.nrel.gov/en/publications/') + + rp.scrape_profiles(TXT_DIR) + pubs.scrape_publications(PDF_DIR, TXT_DIR) + + profiles_meta = rp.build_meta() + pubs_meta = pubs.build_meta() + + pubs_meta['fn'] = pubs_meta.apply(lambda row: + os.path.basename(row['pdf_url']) + if row['category'] == 'Technical Report' + and row['pdf_url'].endswith('.pdf') + else os.path.basename(row['url']) + + '_abstract.txt', axis=1) + pubs_meta['fp'] = pubs_meta.apply(lambda row: + PDF_DIR + row['fn'] + if row['category'] == 'Technical Report' + and row['pdf_url'].endswith('.pdf') + else TXT_DIR + row['fn'], axis=1) + + profiles_meta['fp'] = TXT_DIR + profiles_meta['fn'] + + meta = pd.concat([profiles_meta, pubs_meta], axis=0, ignore_index=True) + meta = meta.drop_duplicates(subset=['nrel_id']) + meta.to_csv('./meta.csv', index=False) + + logger.info('Meta file saved to {}/meta.csv'.format(os.getcwd())) + + missing = ~meta['fp'].apply(os.path.exists) + meta = meta[~missing] + + for i, row in meta.iterrows(): + fp = row['fp'] + txt_fp = os.path.join(TXT_DIR, row['fn'].replace('.pdf', '.txt')) + embed_fp = os.path.join(EMBED_DIR, + row['fn'].replace('.pdf', '.json') + .replace('.txt', '.json')) + + assert os.path.exists(fp), f'{fp} does not exist' + + if os.path.exists(txt_fp): + logger.info(f'Opening:{txt_fp}') + with open(txt_fp, 'r') as f: + text = f.read() + + else: + pdf_obj = PDFtoTXT(fp) + text = pdf_obj.clean_poppler(layout=True) + if pdf_obj.is_double_col(): + text = pdf_obj.clean_poppler(layout=False) + text = pdf_obj.clean_headers(char_thresh=0.6, page_thresh=0.8, + split_on='\n', + iheaders=[0, 1, 3, -3, -2, -1]) + with open(txt_fp, 'w') as f: + f.write(text) + logger.info(f'Saved: {txt_fp}') + + assert os.path.exists(txt_fp) + + if not os.path.exists(embed_fp): + logger.info('Embedding {}/{}: "{}"' + .format(i + 1, len(meta), row['title'])) + tag = f"Title: {row['title']}\nAuthors: {row['authors']}" + obj = ChunkAndEmbed(text, tag=tag, tokens_per_chunk=500, overlap=1) + embeddings = asyncio.run(obj.run_async(rate_limit=3e4)) + if any(e is None for e in embeddings): + raise RuntimeError('Embeddings are None!') + else: + df = 
pd.DataFrame({'text': obj.text_chunks.chunks, + 'embedding': embeddings, + 'nrel_id': row['nrel_id']}) + df.to_json(embed_fp, indent=2) + logger.info('Saved: {}'.format(embed_fp)) + time.sleep(5) + + bad = [] + fps = glob(EMBED_DIR + '*.json') + for fp in fps: + data = pd.read_json(fp) + if data['embedding'].isna().any(): + bad.append(fp) + assert not any(bad), f'Bad output: {bad}' + + logger.info('Finished!') diff --git a/examples/research_hub/run_app.py b/examples/research_hub/run_app.py new file mode 100644 index 00000000..686e5bc5 --- /dev/null +++ b/examples/research_hub/run_app.py @@ -0,0 +1,131 @@ +"""This script launches a streamlit app for the Energy Wizard""" +import streamlit as st +import os +import openai +from glob import glob +import pandas as pd +import sys + +from elm import EnergyWizard + + +model = 'gpt-4' + +# NREL-Azure endpoint. You can also use just the openai endpoint. +# NOTE: embedding values are different between OpenAI and Azure models! +openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT") +openai.api_key = os.getenv("AZURE_OPENAI_KEY") +openai.api_type = 'azure' +openai.api_version = os.getenv('AZURE_OPENAI_VERSION') + +EnergyWizard.EMBEDDING_MODEL = 'text-embedding-ada-002-2' +EnergyWizard.EMBEDDING_URL = ('https://stratus-embeddings-south-central.' + 'openai.azure.com/openai/deployments/' + 'text-embedding-ada-002-2/embeddings?' + f'api-version={openai.api_version}') +EnergyWizard.URL = ('https://stratus-embeddings-south-central.' + 'openai.azure.com/openai/deployments/' + f'{model}/chat/completions?' + f'api-version={openai.api_version}') +EnergyWizard.HEADERS = {"Content-Type": "application/json", + "Authorization": f"Bearer {openai.api_key}", + "api-key": f"{openai.api_key}"} + +EnergyWizard.MODEL_ROLE = ('You are a energy research assistant. Use the ' + 'articles below to answer the question. If ' + 'articles do not provide enough information to ' + 'answer the question, say "I do not know."') +EnergyWizard.MODEL_INSTRUCTION = EnergyWizard.MODEL_ROLE + + +@st.cache_data +def get_corpus(): + """Get the corpus of text data with embeddings.""" + corpus = sorted(glob('./embed/*.json')) + corpus = [pd.read_json(fp) for fp in corpus] + corpus = pd.concat(corpus, ignore_index=True) + meta = pd.read_csv('./meta.csv') + + corpus['nrel_id'] = corpus['nrel_id'].astype(str) + meta['nrel_id'] = meta['nrel_id'].astype(str) + corpus = corpus.set_index('nrel_id') + meta = meta.set_index('nrel_id') + + corpus = corpus.join(meta, on='nrel_id', rsuffix='_record', how='left') + + ref = [f"{row['title']} ({row['url']})" for _, row in corpus.iterrows()] + corpus['ref'] = ref + + return corpus + + +@st.cache_resource +def get_wizard(): + """Get the energy wizard object.""" + + # Getting Corpus of data. If no corpus throw error for user. + try: + corpus = get_corpus() + except Exception: + print("Error: Have you run 'retrieve_docs.py'?") + st.header("Error") + st.write("Error: Have you run 'retrieve_docs.py'?") + sys.exit(0) + + wizard = EnergyWizard(corpus, ref_col='ref', model=model) + return wizard + + +if __name__ == '__main__': + wizard = get_wizard() + + msg = """Hello!\nI am the Energy Wizard - Research Hub edition. I have + access to the NREL Research Hub which includes researcher profiles as + well as NREL Publications.Note that each question you ask is independent. + I am not fully conversational yet like ChatGPT is. Here are some examples + of questions you can ask me: + \n - What is 'insert researcher name' position at NREL? 
+ \n - Which publication has 'researcher name' contributed to? + \n - Can you summarize 'publication name'? + \n - Who has experience researching grid resilience? + \n - Who at NREL has experience with on techno-economic analysis? + """ + + st.title(msg) + + if "messages" not in st.session_state: + st.session_state.messages = [] + + for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.markdown(message["content"]) + + msg = "Type your question here" + if prompt := st.chat_input(msg): + st.chat_message("user").markdown(prompt) + st.session_state.messages.append({"role": "user", "content": prompt}) + + with st.chat_message("assistant"): + + message_placeholder = st.empty() + full_response = "" + + out = wizard.chat(prompt, + debug=True, stream=True, token_budget=6000, + temperature=0.0, print_references=True, + convo=False, return_chat_obj=True) + references = out[-1] + + for response in out[0]: + full_response += response.choices[0].delta.content or "" + message_placeholder.markdown(full_response + "▌") + + ref_msg = ('\n\nThe wizard was provided with the ' + 'following documents to support its answer:') + ref_msg += '\n - ' + '\n - '.join(references) + full_response += ref_msg + + message_placeholder.markdown(full_response) + + st.session_state.messages.append({"role": "assistant", + "content": full_response}) diff --git a/requirements.txt b/requirements.txt index aca4d724..a270a202 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,4 @@ python-slugify scipy tabulate tiktoken +beautifulsoup4 diff --git a/tests/data/rhub_html.txt b/tests/data/rhub_html.txt new file mode 100644 index 00000000..ffb8daf5 --- /dev/null +++ b/tests/data/rhub_html.txt @@ -0,0 +1,4216 @@ + + + + + + + + + + + + + + + Find NREL Research Outputs + — National Renewable Energy Laboratory + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ [... remainder of the rhub_html.txt fixture: 4,216 lines (per the hunk header) of HTML markup for the research-hub.nrel.gov "Find NREL Research Outputs" publications page, including the search form ("Search in All Content"), the "Filters for Research Output" facets ("Research Units", "Search concepts", "Selected Filters"), and the "Search results" list parsed by ResearchOutputs; truncated here for brevity ...]
diff --git a/tests/test_osti.py b/tests/web/test_osti.py similarity index 100% rename from tests/test_osti.py rename to tests/web/test_osti.py
diff --git a/tests/web/test_rhub.py b/tests/web/test_rhub.py new file mode 100644 index 00000000..c80374c9 --- /dev/null +++ b/tests/web/test_rhub.py @@ -0,0 +1,36 @@
+"""Test research hub html response"""
+import os
+from elm import TEST_DATA_DIR
+from elm.web.rhub import ResearchOutputs
+import elm.web.rhub
+
+
+FP_TXT = os.path.join(TEST_DATA_DIR, 'rhub_html.txt')
+
+with open(FP_TXT, 'r', encoding='utf8') as f:
+    TEXT = f.read()
+
+
+class MockClass:
+    """Dummy class to mock ResearchOutputs.html_response()"""
+
+    @staticmethod
+    def call(*args, **kwargs):  # pylint: disable=unused-argument
+        """Mock for ResearchOutputs.html_response()"""
+        return TEXT
+
+
+def test_rhub(mocker):
+    """Test to ensure correct response from research hub."""
+    mocker.patch.object(elm.web.rhub.ResearchOutputs,
+                        'html_response', MockClass.call)
+
+    out = ResearchOutputs("dummy")
+
+    meta = out.build_meta()
+
+    assert len(meta) > 10
+    assert 'title' in meta.columns
+    assert 'url' in meta.columns
+    assert meta['title'].isna().sum() == 0
+    assert meta['url'].isna().sum() == 0
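
A minimal usage sketch of the new classes introduced in elm/web/rhub.py, mirroring what examples/research_hub/retrieve_docs.py does; this is illustrative only (not part of the diff), assumes live network access to research-hub.nrel.gov, and uses only the constructors and methods defined above:

    from elm.web.rhub import ResearchOutputs, ResearcherProfiles

    # Fetch one listing page of publication and researcher links
    # (each class internally caps how many records it processes).
    pubs = ResearchOutputs("https://research-hub.nrel.gov/en/publications/", n_pages=1)
    profiles = ResearcherProfiles("https://research-hub.nrel.gov/en/persons/", n_pages=1)

    # Technical-report PDFs are downloaded to ./pdfs/; abstracts and
    # researcher profiles are written as .txt files to ./txt/.
    pubs.scrape_publications("./pdfs/", "./txt/")
    profiles.scrape_profiles("./txt/")

    # Metadata frames that retrieve_docs.py concatenates into meta.csv
    # before chunking and embedding.
    pubs_meta = pubs.build_meta()          # title, nrel_id, authors, year, url, doi, pdf_url, category
    profiles_meta = profiles.build_meta()  # title, nrel_id, email, url, fn, category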