diff --git a/.env.example b/.env.example index 628df87..ea83a48 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,4 @@ -# MongoDB Config -DATABASE_HOST=mongodb://decodingml:decodingml@127.0.0.1:27017 -DATABASE_NAME=twin +# --- Required settings even when working locally. --- # OpenAI API Config OPENAI_MODEL_ID=gpt-4o-mini @@ -9,15 +7,23 @@ OPENAI_API_KEY=str # Huggingface API Config HUGGINGFACE_ACCESS_TOKEN=str -# RAG -RAG_MODEL_DEVICE=cpu +# Comet ML (during training) +COMET_API_KEY=str +COMET_WORKSPACE=llm-engineers-handbook -# AWS Credentials +# --- Required settings when deploying the code. --- +# --- Otherwise, default values values work fine. --- + +# MongoDB database +DATABASE_HOST="mongodb://decodingml:decodingml@127.0.0.1:27017" + +# Qdrant vector database +USE_QDRANT_CLOUD=false +QDRANT_CLOUD_URL="str" +QDRANT_APIKEY="str" + +# AWS Authentication AWS_ARN_ROLE=str +AWS_REGION=eu-central-1 AWS_ACCESS_KEY=str AWS_SECRET_KEY=str -AWS_REGION=eu-central-1 - -# LinkedIn Credentials -LINKEDIN_USERNAME=str -LINKEDIN_PASSWORD=str diff --git a/README.md b/README.md index c759511..681dfbb 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,108 @@ # LLM-Engineering -## Dependencies +Repository that contains all the code used throughout the [LLM Engineer's Handbook](https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/). 
-- Python 3.11 -- Poetry 1.8.3 -- Docker 26.0.0 +![Book Cover](/images/book_cover.png) -## Install +# Dependencies +## Local dependencies + +To install and run the project locally, you need the following dependencies (the code was tested with the specified versions of the dependencies): + +- [pyenv 2.3.36](https://github.com/pyenv/pyenv) (optional: for installing multiple Python versions on your machine) +- [Python 3.11](https://www.python.org/downloads/) +- [Poetry 1.8.3](https://python-poetry.org/docs/#installation) +- [Docker 27.1.1](https://docs.docker.com/engine/install/) + +## Cloud services + +The code also uses and depends on the following cloud services. For now, you don't have to do anything. We will guide you in the installation and deployment sections on how to use them: + +- [HuggingFace](https://huggingface.com/): Model registry +- [Comet ML](https://www.comet.com/site/): Experiment tracker +- [Opik](https://www.comet.com/site/products/opik/): LLM evaluation and prompt monitoring +- [ZenML](https://www.zenml.io/): Orchestrator +- [AWS](https://aws.amazon.com/): Compute and storage +- [MongoDB](https://www.mongodb.com/): NoSQL database +- [Qdrant](https://qdrant.tech/): Vector database + +In the [LLM Engineer's Handbook](https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/), Chapter 2 will walk you through each tool, and in Chapters 10 and 11, you will have step-by-step guides on how to set everything you need. + +# Install + +## Install Python 3.11 using pyenv (Optional) + +If you have a different global Python version than Python 3.11, you can use pyenv to install Python 3.11 at the project level. 
Verify your Python version with: +```shell +python --version +``` + +First, verify that you have pyenv installed: +```shell +pyenv --version +# Output: pyenv 2.3.36 +``` + +Install Python 3.11: +```shell +pyenv install 3.11 +``` + +From the root of your repository, run the following to verify that everything works fine: +```shell +pyenv versions +# Output: +# system +# * 3.11.8 (set by /LLM-Engineers-Handbook/.python-version) +``` + +Because we defined a `.python-version` file within the repository, pyenv will know to pick up the version from that file and use it locally whenever you are working within that folder. To double-check that, run the following command while you are in the repository: +```shell +python --version +# Output: Python 3.11.8 +``` + +If you move out of this repository, both `pyenv versions` and `python --version` might output different Python versions. + +## Install project dependencies + +The first step is to verify that you have Poetry installed: +```shell +poetry --version +# Output: Poetry (version 1.8.3) +``` + +Use Poetry to install all the project's requirements to run it locally. Thus, we don't need to install any AWS dependencies. Also, we install Poe the Poet as a Poetry plugin to manage our CLI commands and pre-commit to verify our code before committing changes to git: ```shell poetry install --without aws -poetry self add 'poethepoet[poetry_plugin]' +poetry self add 'poethepoet[poetry_plugin]==0.29.0' pre-commit install ``` -We run all the scripts using [Poe the Poet](https://poethepoet.natn.io/index.html). You don't have to do anything else but install it as a Poetry plugin. +We run all the scripts using [Poe the Poet](https://poethepoet.natn.io/index.html). 
You don't have to do anything else but install Poe the Poet as a Poetry plugin, as described above: `poetry self add 'poethepoet[poetry_plugin]'` + +To activate the environment created by Poetry, run: +```shell +poetry shell +``` + +## Set up .env settings file (for local development) -### Configure sensitive information -After you have installed all the dependencies, you must create a `.env` file with sensitive credentials to run the project. +After you have installed all the dependencies, you must create and fill a `.env` file with your credentials to properly interact with other services and run the project. First, copy our example by running the following: ```shell -cp .env.example .env # The file has to be at the root of your repository! +cp .env.example .env # The file must be at your repository's root! ``` Now, let's understand how to fill in all the variables inside the `.env` file to get you started. +We will begin by reviewing the mandatory settings we must complete when working locally or in the cloud. + ### OpenAI -To authenticate to OpenAI, you must fill out the `OPENAI_API_KEY` env var with an authentication token. +To authenticate to OpenAI's API, you must fill out the `OPENAI_API_KEY` env var with an authentication token. → Check out this [tutorial](https://platform.openai.com/docs/quickstart) to learn how to provide one from OpenAI. @@ -38,32 +112,53 @@ To authenticate to HuggingFace, you must fill out the `HUGGINGFACE_ACCESS_TOKEN` → Check out this [tutorial](https://huggingface.co/docs/hub/en/security-tokens) to learn how to provide one from HuggingFace. +### Comet ML -### LinkedIn Crawling [Optional] -This step is optional. You can finish the project without this step. +Comet ML is required only during training. 
-But in case you want to enable LinkedIn crawling, you have to fill in your username and password: -```shell -LINKEDIN_USERNAME = "str" -LINKEDIN_PASSWORD = "str" +To authenticate to Comet ML, you must fill out the `COMET_API_KEY` and `COMET_WORKSPACE` env vars with an authentication token and workspace name. + +→ Check out this [tutorial](https://www.comet.com/docs/v2/api-and-sdk/rest-api/overview/) to learn how to fill the Comet ML variables from above. + +### Opik + +> Soon + + +## Set up .env settings file (for deployment) + +When deploying the project to the cloud, we must set additional settings for Mongo, Qdrant, and AWS. + +If you are just working locally, the default values of these env vars will work out-of-the-box. + +We will just highlight what has to be configured, as **Chapter 11** of the [LLM Engineer's Handbook](https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/) provides step-by-step details on how to deploy the whole system to the cloud. + +### MongoDB + +We must change the `DATABASE_HOST` env var to the URL pointing to the cloud MongoDB cluster. + +### Qdrant + +Set `USE_QDRANT_CLOUD` to `true`, `QDRANT_CLOUD_URL` to the URL, and `QDRANT_APIKEY` to the API key of your cloud Qdrant cluster. + +To work with Qdrant cloud, the env vars will look like this: +```env +USE_QDRANT_CLOUD=true +QDRANT_CLOUD_URL="" +QDRANT_APIKEY="" ``` -For this to work, you also have to: -- disable 2FA -- disable suspicious activity +### AWS -We also recommend to: -- create a dummy profile for crawling -- crawl only your data > [!IMPORTANT] -> Find more configuration options in the [settings.py](https://github.com/PacktPublishing/LLM-Engineering/blob/main/llm_engineering/settings.py) file. Every variable from the `Settings` class can be configured through the `.env` file. 
+> Find more configuration options in the [settings.py](https://github.com/PacktPublishing/LLM-Engineers-Handbook/blob/main/llm_engineering/settings.py) file. Every variable from the `Settings` class can be configured through the `.env` file. -## Run Locally +# Run the project locally -### Local Infrastructure +## Local infrastructure > [!WARNING] > You need Docker installed (v27.1.1 or higher) @@ -84,7 +179,7 @@ poetry poe local-infrastructure-down > `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES` > Otherwise, the connection between the local server and pipeline will break. 🔗 More details in [this issue](https://github.com/zenml-io/zenml/issues/2369). -#### ZenML is now accessible at: +### ZenML is now accessible at: Web UI: [localhost:8237](localhost:8237) @@ -94,7 +189,7 @@ Default credentials: →🔗 [More on ZenML](https://docs.zenml.io/) -#### Qdrant is now accessible at: +### Qdrant is now accessible at: REST API: [localhost:6333](localhost:6333) Web UI: [localhost:6333/dashboard](localhost:6333/dashboard) @@ -102,7 +197,7 @@ GRPC API: [localhost:6334](localhost:6334) →🔗 [More on Qdrant](https://qdrant.tech/documentation/quick-start/) -#### MongoDB is now accessible at: +### MongoDB is now accessible at: database URI: `mongodb://decodingml:decodingml@127.0.0.1:27017` database name: `twin` @@ -113,7 +208,7 @@ database name: `twin` We will fill this section in the future. So far it is available only in the 11th Chapter of the book. -### Run Pipelines +## Run Pipelines All the pipelines will be orchestrated behind the scenes by ZenML. 
@@ -126,7 +221,7 @@ To see the pipelines running and their results: **But first, let's understand how we can run all our ML pipelines** ↓ -#### Data pipelines +### Data pipelines Run the data collection ETL: ```shell @@ -155,14 +250,14 @@ poetry poe run-end-to-end-data-pipeline ``` -#### Utility pipelines +### Utility pipelines Export ZenML artifacts to JSON: ```shell poetry poe run-export-artifact-to-json-pipeline ``` -#### Training pipelines +### Training pipelines ```shell poetry poe run-training-pipeline diff --git a/images/book_cover.png b/images/book_cover.png new file mode 100644 index 0000000..c345a40 Binary files /dev/null and b/images/book_cover.png differ diff --git a/llm_engineering/application/crawlers/linkedin.py b/llm_engineering/application/crawlers/linkedin.py index 88b90ff..d717dcc 100644 --- a/llm_engineering/application/crawlers/linkedin.py +++ b/llm_engineering/application/crawlers/linkedin.py @@ -16,10 +16,36 @@ class LinkedInCrawler(BaseSeleniumCrawler): model = PostDocument + def __init__(self, scroll_limit: int = 5, is_deprecated: bool = True) -> None: + super().__init__(scroll_limit) + + self._is_deprecated = is_deprecated + def set_extra_driver_options(self, options) -> None: options.add_experimental_option("detach", True) + def login(self) -> None: + if self._is_deprecated: + raise DeprecationWarning( + "As LinkedIn has updated its security measures, the login() method is no longer supported." + ) + + self.driver.get("https://www.linkedin.com/login") + if not settings.LINKEDIN_USERNAME or not settings.LINKEDIN_PASSWORD: + raise ImproperlyConfigured( + "LinkedIn scraper requires the {LINKEDIN_USERNAME} and {LINKEDIN_PASSWORD} settings." 
+ ) + + self.driver.find_element(By.ID, "username").send_keys(settings.LINKEDIN_USERNAME) + self.driver.find_element(By.ID, "password").send_keys(settings.LINKEDIN_PASSWORD) + self.driver.find_element(By.CSS_SELECTOR, ".login__form_action_container button").click() + def extract(self, link: str, **kwargs) -> None: + if self._is_deprecated: + raise DeprecationWarning( + "As LinkedIn has updated its feed structure, the extract() method is no longer supported." + ) + if self.model.link is not None: old_model = self.model.find(link=link) if old_model is not None: @@ -76,7 +102,9 @@ def extract(self, link: str, **kwargs) -> None: def _scrape_section(self, soup: BeautifulSoup, *args, **kwargs) -> str: """Scrape a specific section of the LinkedIn profile.""" # Example: Scrape the 'About' section + parent_div = soup.find(*args, **kwargs) + return parent_div.get_text(strip=True) if parent_div else "" def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]: @@ -89,6 +117,7 @@ def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]: Returns: Dict[str, str]: A dictionary mapping post indexes to image URLs. """ + post_images = {} for i, button in enumerate(buttons): img_tag = button.find("img") @@ -100,8 +129,10 @@ def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]: def _get_page_content(self, url: str) -> BeautifulSoup: """Retrieve the page content of a given URL.""" + self.driver.get(url) time.sleep(5) + return BeautifulSoup(self.driver.page_source, "html.parser") def _extract_posts(self, post_elements: List[Tag], post_images: Dict[str, str]) -> Dict[str, Dict[str, str]]: @@ -115,6 +146,7 @@ def _extract_posts(self, post_elements: List[Tag], post_images: Dict[str, str]) Returns: Dict[str, Dict[str, str]]: A dictionary containing post data with text and optional image URL. 
""" + posts_data = {} for i, post_element in enumerate(post_elements): post_text = post_element.get_text(strip=True, separator="\n") @@ -122,14 +154,17 @@ def _extract_posts(self, post_elements: List[Tag], post_images: Dict[str, str]) if f"Post_{i}" in post_images: post_data["image"] = post_images[f"Post_{i}"] posts_data[f"Post_{i}"] = post_data + return posts_data def _scrape_experience(self, profile_url: str) -> str: """Scrapes the Experience section of the LinkedIn profile.""" + self.driver.get(profile_url + "/details/experience/") time.sleep(5) soup = BeautifulSoup(self.driver.page_source, "html.parser") experience_content = soup.find("section", {"id": "experience-section"}) + return experience_content.get_text(strip=True) if experience_content else "" def _scrape_education(self, profile_url: str) -> str: @@ -137,16 +172,5 @@ def _scrape_education(self, profile_url: str) -> str: time.sleep(5) soup = BeautifulSoup(self.driver.page_source, "html.parser") education_content = soup.find("section", {"id": "education-section"}) - return education_content.get_text(strip=True) if education_content else "" - - def login(self): - """Log in to LinkedIn.""" - self.driver.get("https://www.linkedin.com/login") - if not settings.LINKEDIN_USERNAME or not settings.LINKEDIN_PASSWORD: - raise ImproperlyConfigured( - "LinkedIn scraper requires the {LINKEDIN_USERNAME} and {LINKEDIN_PASSWORD} settings." 
- ) - self.driver.find_element(By.ID, "username").send_keys(settings.LINKEDIN_USERNAME) - self.driver.find_element(By.ID, "password").send_keys(settings.LINKEDIN_PASSWORD) - self.driver.find_element(By.CSS_SELECTOR, ".login__form_action_container button").click() + return education_content.get_text(strip=True) if education_content else "" diff --git a/llm_engineering/settings.py b/llm_engineering/settings.py index 66f6ccf..5a4a3d2 100644 --- a/llm_engineering/settings.py +++ b/llm_engineering/settings.py @@ -7,41 +7,43 @@ class Settings(BaseSettings): model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") - # MongoDB NoSQL Database - DATABASE_HOST: str = "mongodb://decodingml:decodingml@127.0.0.1:27017" - DATABASE_NAME: str = "twin" + # --- Required settings even when working locally. --- - # LinkedIn Credentials - LINKEDIN_USERNAME: str | None = None - LINKEDIN_PASSWORD: str | None = None + # OpenAI API + OPENAI_MODEL_ID: str = "gpt-4o-mini" + OPENAI_API_KEY: str | None = None - # RAG - TEXT_EMBEDDING_MODEL_ID: str = "sentence-transformers/all-MiniLM-L6-v2" - RERANKING_CROSS_ENCODER_MODEL_ID: str = "cross-encoder/ms-marco-MiniLM-L-4-v2" - RAG_MODEL_DEVICE: str = "cpu" + # Huggingface API + HUGGINGFACE_ACCESS_TOKEN: str | None = None - # QdrantDB Vector DB - USE_QDRANT_CLOUD: bool = False + # Comet ML (during training) + COMET_API_KEY: str | None = None + COMET_WORKSPACE: str = "llm-engineers-handbook" + COMET_PROJECT: str = "twin" + + # --- Required settings when deploying the code. --- + # --- Otherwise, default values values work fine. 
--- + # MongoDB database + DATABASE_HOST: str = "mongodb://decodingml:decodingml@127.0.0.1:27017" + DATABASE_NAME: str = "twin" + + # Qdrant vector database + USE_QDRANT_CLOUD: bool = False QDRANT_DATABASE_HOST: str = "localhost" QDRANT_DATABASE_PORT: int = 6333 - QDRANT_CLOUD_URL: str = "str" QDRANT_APIKEY: str | None = None - # OpenAI API - OPENAI_MODEL_ID: str = "gpt-4o-mini" - OPENAI_API_KEY: str | None = None - - # Huggingface API - HUGGINGFACE_ACCESS_TOKEN: str | None = None + # AWS Authentication + AWS_ARN_ROLE: str | None = None + AWS_REGION: str = "eu-central-1" + AWS_ACCESS_KEY: str | None = None + AWS_SECRET_KEY: str | None = None - # CometML config - COMET_API_KEY: str | None = None - COMET_WORKSPACE: str | None = None - COMET_PROJECT: str | None = None + # --- Optional settings used to tweak the code. --- - # SageMaker + # AWS SageMaker HF_MODEL_ID: str = "mlabonne/TwinLlama-3.1-8B-13" # or use "crumb/nano-mistral" for a quick test with a small model GPU_INSTANCE_TYPE: str = "ml.g5.xlarge" SM_NUM_GPUS: int = 1 @@ -58,11 +60,14 @@ class Settings(BaseSettings): TOP_P_INFERENCE: float = 0.9 MAX_NEW_TOKENS_INFERENCE: int = 150 - # AWS - AWS_ARN_ROLE: str | None = None - AWS_REGION: str = "eu-central-1" - AWS_ACCESS_KEY: str | None = None - AWS_SECRET_KEY: str | None = None + # RAG + TEXT_EMBEDDING_MODEL_ID: str = "sentence-transformers/all-MiniLM-L6-v2" + RERANKING_CROSS_ENCODER_MODEL_ID: str = "cross-encoder/ms-marco-MiniLM-L-4-v2" + RAG_MODEL_DEVICE: str = "cpu" + + # LinkedIn Credentials + LINKEDIN_USERNAME: str | None = None + LINKEDIN_PASSWORD: str | None = None @property def OPENAI_MAX_TOKEN_WINDOW(self) -> int: @@ -71,7 +76,7 @@ def OPENAI_MAX_TOKEN_WINDOW(self) -> int: "gpt-4-turbo": 128000, "gpt-4o": 128000, "gpt-4o-mini": 128000, - }[self.OPENAI_MODEL_ID] + }.get(self.OPENAI_MODEL_ID, 128000) max_token_window = int(official_max_token_window * 0.90) diff --git a/poetry.lock b/poetry.lock index 861a3fe..6c74537 100644 --- a/poetry.lock +++ 
b/poetry.lock @@ -3248,6 +3248,17 @@ bcrypt = ["bcrypt (>=3.1.0)"] build-docs = ["cloud-sptheme (>=1.10.1)", "sphinx (>=1.6)", "sphinxcontrib-fulltoc (>=1.2.0)"] totp = ["cryptography"] +[[package]] +name = "pastel" +version = "0.2.1" +description = "Bring colors to your terminal." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "pastel-0.2.1-py2.py3-none-any.whl", hash = "sha256:4349225fcdf6c2bb34d483e523475de5bb04a5c10ef711263452cb37d7dd4364"}, + {file = "pastel-0.2.1.tar.gz", hash = "sha256:e6581ac04e973cac858828c6202c1e1e81fee1dc7de7683f3e1ffe0bfd8a573d"}, +] + [[package]] name = "pathos" version = "0.3.2" @@ -3429,6 +3440,24 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "poethepoet" +version = "0.29.0" +description = "A task runner that works well with poetry." +optional = false +python-versions = ">=3.8" +files = [ + {file = "poethepoet-0.29.0-py3-none-any.whl", hash = "sha256:f8dfe55006dcfb5cf31bcb1904e1262e1c642a4502fee3688cbf1bddfe5c7601"}, + {file = "poethepoet-0.29.0.tar.gz", hash = "sha256:676842302f2304a86b31ac56398dd672fae8471128d2086896393384dbafc095"}, +] + +[package.dependencies] +pastel = ">=0.2.1,<0.3.0" +pyyaml = ">=6.0.2,<7.0.0" + +[package.extras] +poetry-plugin = ["poetry (>=1.0,<2.0)"] + [[package]] name = "pooch" version = "1.8.2" @@ -6386,4 +6415,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "~3.11" -content-hash = "3072d16a33f3cb313338c9d84c95ec484845b67c331ab0a5c3d3c0bc3272e2fe" +content-hash = "0cf3b060758d341f5a07be3c71a354e402071a01fae2a4bfefb85e783eb9535b" diff --git a/pyproject.toml b/pyproject.toml index 3e786aa..3823e7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ uvicorn = "^0.30.6" # trl = "0.9.6" # bitsandbytes = "0.42" # comet-ml = "3.44.3" +poethepoet = "0.29.0" [tool.poetry.group.dev.dependencies]