From f7e5f89f342c76a371e145d205d7bc81de608c0b Mon Sep 17 00:00:00 2001 From: ahmetkca Date: Mon, 11 Dec 2023 04:02:10 -0500 Subject: [PATCH 1/2] Add Azure DevOps integration --- llama_hub/github_repo/README.md | 48 +++++ llama_hub/github_repo/azure_devops.py | 246 +++++++++++++++++++++++++ llama_hub/github_repo/base.py | 23 ++- llama_hub/github_repo/requirements.txt | 3 +- 4 files changed, 312 insertions(+), 8 deletions(-) create mode 100644 llama_hub/github_repo/azure_devops.py diff --git a/llama_hub/github_repo/README.md b/llama_hub/github_repo/README.md index 1bb2e8e1d9..809f3a6dac 100644 --- a/llama_hub/github_repo/README.md +++ b/llama_hub/github_repo/README.md @@ -39,6 +39,54 @@ for doc in docs: print(doc.extra_info) ``` +### Azure DevOps + +```bash +export AZURE_DEVOPS_BASE_URL='...' +export AZURE_DEVOPS_USERNAME='...' +export AZURE_DEVOPS_PASSWORD='...' +``` + +```python +import os + +from llama_index import download_loader +download_loader("GithubRepositoryReader") + +from llama_hub.github_repo import GithubRepositoryReader, AzureDevOpsAdapter + +# Example: https://dev.azure.com/ahmetkarapinar/testProject/_git/testProject/commit/08633d3844192a69ab5011c20201dba3aced0a41?refName=refs%2Fheads%2Fmaster +# 'ahmetkarapinar' is organization id +# 'testProject' is project id +# 'testProject' is repository id +# '08633d3844192a69ab5011c20201dba3aced0a41' is the commit sha +# 'master' is the branch name + + +azure_devops_adapter = AzureDevOpsAdapter( + base_url=os.environ["AZURE_DEVOPS_BASE_URL"], # Ex. 
'https://dev.azure.com/YOURORG' + username=os.environ["AZURE_DEVOPS_USERNAME"], + password=os.environ["AZURE_DEVOPS_PASSWORD"], +) + +loader = GithubRepositoryReader( + github_client = azure_devops_adapter, + owner = "", + repo = "", + filter_directories = (["llama_index", "docs"], GithubRepositoryReader.FilterType.INCLUDE), + filter_file_extensions = ([".py"], GithubRepositoryReader.FilterType.INCLUDE), + verbose = True, + concurrent_requests = 10, +) + +docs = loader.load_data(branch="main") +# alternatively, load from a specific commit: +# docs = loader.load_data(commit_sha="a6c89159bf8e7086bea2f4305cff3f0a4102e370") + +for doc in docs: + print(doc.extra_info) +``` + ## Examples This loader designed to be used as a way to load data into [Llama Index](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. diff --git a/llama_hub/github_repo/azure_devops.py b/llama_hub/github_repo/azure_devops.py new file mode 100644 index 0000000000..a78045952d --- /dev/null +++ b/llama_hub/github_repo/azure_devops.py @@ -0,0 +1,246 @@ +""" +Azure DevOps Client Adapter for BaseGithubClient. + +This class is used to interact with Azure DevOps repositories. It uses the azure-devops package. +The implementation is merely a workaround to use the same code for Github and Azure DevOps. 
+""" + +from typing import Any, Dict, List, Optional +from llama_hub.github_repo.github_client import ( + BaseGithubClient, + GitBlobResponseModel, + GitBranchResponseModel, + GitCommitResponseModel, + GitTreeResponseModel +) + +from azure.devops.v7_0.git.git_client import GitClient +from azure.devops.v7_0.git.models import GitTreeRef +from azure.devops.v7_0.git.models import GitTreeEntryRef +from azure.devops.v7_0.git.models import GitBlobRef +from azure.devops.v7_0.git.models import GitCommit +from azure.devops.v7_0.git.models import GitBranchStats + + +class AzureDevOpsAdapter(BaseGithubClient): + """ + Azure DevOps adapter. + + This class is used to interact with Azure DevOps repositories. It uses the azure-devops package. + Each method is same as the corresponding method in BaseGithubClient. All of the Azure DevOps specific + response models are converted to the corresponding Github response models. + + Args: + - `base_url (str)`: Azure DevOps base url. Example: 'https://dev.azure.com/YOURORG' + - `username (str)`: Azure DevOps username. You can leave this blank if you are using a PAT. ex: '' + - `password (str)`: Azure DevOps password. Personal Access Token (PAT) is recommended. + + Raises: + - `ImportError`: If azure-devops package is not installed. + - `ValueError`: If base_url, username or password is not provided. + """ + def __init__(self, + *args: Any, + **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + try: + from azure.devops.connection import Connection + from msrest.authentication import BasicAuthentication + except ImportError: + raise ImportError( + "Please install azure-devops package to use Azure DevOps adapter" + ) + if kwargs.get("base_url") is None: + raise ValueError("Azure DevOps base_url is required. Example: 'https://dev.azure.com/YOURORG'") + if kwargs.get("username") is None: + raise ValueError("Azure DevOps username is required. You can leave this blank if you are using a PAT. 
ex: ''") + if kwargs.get("password") is None: + raise ValueError("Azure DevOps password is required. Personal Access Token (PAT) is recommended.") + + + self.connection = Connection( + base_url=kwargs.get("base_url"), + creds=BasicAuthentication( + username=kwargs.get("username"), + password=kwargs.get("password"), + ), + ) + self._git_client: GitClient = self.connection.clients.get_git_client() + + def get_all_endpoints(self) -> Dict[str, str]: + raise NotImplementedError + + async def request( + self, + endpoint: str, + method: str, + headers: Dict[str, Any] = {}, + **kwargs: Any, + ) -> Any: + raise NotImplementedError + + async def get_tree( + self, + owner: str, + repo: str, + tree_sha: str, + ) -> GitTreeResponseModel: + """ + Get the tree for a given sha. + + Args: + - `owner (str)`: Project name or project id. + - `repo (str)`: repository id. + - `tree_sha (str)`: sha of the tree. + + Returns: + - `tree (GitTreeResponseModel)`: Tree response model. + """ + _git_tree_response: GitTreeRef = self._git_client.get_tree( + repository_id=repo, + sha1=tree_sha, + project=owner, + ) + + git_tree_object_list: List[GitTreeResponseModel.GitTreeObject] = [] + tree_entry: GitTreeEntryRef + for tree_entry in _git_tree_response.tree_entries: + git_tree_object: GitTreeResponseModel.GitTreeObject = GitTreeResponseModel.GitTreeObject( + path=tree_entry.relative_path, + mode=tree_entry.mode, + type=tree_entry.git_object_type, + sha=tree_entry.object_id, + url=tree_entry.url, + size=tree_entry.size, + ) + git_tree_object_list.append(git_tree_object) + return GitTreeResponseModel( + sha=_git_tree_response.object_id, + url=_git_tree_response.url, + tree=git_tree_object_list, + truncated=False, + ) + + + async def get_blob( + self, + owner: str, + repo: str, + file_sha: str, + ) -> GitBlobResponseModel: + """ + Get the blob for a given sha. + + Args: + - `owner (str)`: Project name or project id. + - `repo (str)`: repository id. + - `file_sha (str)`: sha of the blob. 
+ + Returns: + - `blob (GitBlobResponseModel)`: Blob response model. + """ + _git_blob_response: GitBlobRef = self._git_client.get_blob( + repository_id=repo, + sha1=file_sha, + project=owner, + download=False, + resolve_lfs=False, + ) + + _git_blob_content_iterator = self._git_client.get_blob_content( + repository_id=repo, + sha1=file_sha, + project=owner, + download=False, + resolve_lfs=False, + ) + + size = 0 + _git_blob_content: bytes = b"" + for chunk in _git_blob_content_iterator: + _git_blob_content += chunk + size += len(chunk) + + return GitBlobResponseModel( + content=_git_blob_content, + size=size, + encoding="utf-8", + sha=_git_blob_response.object_id, + url=_git_blob_response.url, + node_id=None + ) + + async def get_commit( + self, + owner: str, + repo: str, + commit_sha: str, + ) -> GitCommitResponseModel: + """ + Get the commit for a given sha. + + Args: + - `owner (str)`: Project name or project id. + - `repo (str)`: repository id. + - `commit_sha (str)`: sha of the commit. + + Returns: + - `commit (GitCommitResponseModel)`: Commit response model. + """ + _git_commit_response: GitCommit = self._git_client.get_commit( + repository_id=repo, + commit_id=commit_sha, + project=owner, + ) + + return GitCommitResponseModel( + url=_git_commit_response.url, + sha=_git_commit_response.commit_id, + commit=GitCommitResponseModel.Commit( + tree=GitCommitResponseModel.Commit.Tree( + sha=_git_commit_response.tree_id, + ), + )) + + async def get_branch( + self, + owner: str, + repo: str, + branch: Optional[str], + branch_name: Optional[str], + ) -> GitBranchResponseModel: + """ + Get the branch for a given branch name. + + Args: + - `owner (str)`: Project name or project id. + - `repo (str)`: repository id. + - `branch (str)`: branch name. + + Returns: + - `branch (GitBranchResponseModel)`: Branch response model. 
+ """ + _git_branch_response: GitBranchStats = self._git_client.get_branch( + repository_id=repo, + project=owner, + name=branch + ) + + # get the latest commit for the branch + _git_commit_response: GitCommit = self._git_client.get_commit( + repository_id=repo, + commit_id=_git_branch_response.commit.commit_id, + project=owner, + ) + + return GitBranchResponseModel( + name=_git_branch_response.name, + commit=GitBranchResponseModel.Commit( + commit=GitBranchResponseModel.Commit.Commit( + tree=GitBranchResponseModel.Commit.Commit.Tree( + sha=_git_commit_response.tree_id, + ), + ), + ), + _links=None, + ) \ No newline at end of file diff --git a/llama_hub/github_repo/base.py b/llama_hub/github_repo/base.py index d2097b799b..6328bd0345 100644 --- a/llama_hub/github_repo/base.py +++ b/llama_hub/github_repo/base.py @@ -18,6 +18,8 @@ from llama_index.readers.base import BaseReader from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS from llama_index.readers.schema.base import Document +from llama_hub.github_repo import github_client +from llama_hub.github_repo.azure_devops import AzureDevOpsAdapter from llama_hub.github_repo.github_client import ( BaseGithubClient, @@ -247,7 +249,7 @@ def _load_data_from_branch(self, branch: str) -> List[Document]: :return: list of documents """ branch_data: GitBranchResponseModel = self._loop.run_until_complete( - self._github_client.get_branch(self._owner, self._repo, branch) + self._github_client.get_branch(self._owner, self._repo, branch, branch) ) tree_sha = branch_data.commit.commit.tree.sha @@ -393,7 +395,7 @@ async def _generate_documents( async for blob_data, full_path in buffered_iterator: print_if_verbose(self._verbose, f"generating document for {full_path}") assert ( - blob_data.encoding == "base64" + blob_data.encoding == "base64" or blob_data.encoding == "utf-8" ), f"blob encoding {blob_data.encoding} not supported" decoded_bytes = None try: @@ -403,7 +405,14 @@ async def _generate_documents( print_if_verbose( 
self._verbose, f"could not decode {full_path} as base64" ) - continue + # tried to decode the content that was base64 encoded but failed + # continue + if blob_data.encoding == "base64": + continue + # if the content was not base64 encoded and we failed to decode it + # as base64, then we assume it is raw text + decoded_bytes = blob_data.content + if self._use_parser: document = self._parse_supported_file( @@ -547,7 +556,7 @@ def wrapper(*args: Any, **kwargs: Any) -> None: verbose=True, filter_directories=( ["docs"], - GithubRepositoryReader.FilterType.INCLUDE, + GithubRepositoryReader.FilterType.EXCLUDE, ), filter_file_extensions=( [ @@ -557,7 +566,7 @@ def wrapper(*args: Any, **kwargs: Any) -> None: ".gif", ".svg", ".ico", - "json", + ".json", ".ipynb", ], GithubRepositoryReader.FilterType.EXCLUDE, @@ -584,6 +593,6 @@ def load_data_from_branch() -> None: load_data_from_branch() - # input("Press enter to load github repository from commit sha...") + input("Press enter to load github repository from commit sha...") - # load_data_from_commit() + load_data_from_commit() diff --git a/llama_hub/github_repo/requirements.txt b/llama_hub/github_repo/requirements.txt index 79228389fc..7dd09db67d 100644 --- a/llama_hub/github_repo/requirements.txt +++ b/llama_hub/github_repo/requirements.txt @@ -1 +1,2 @@ -httpx \ No newline at end of file +httpx +azure-devops \ No newline at end of file From 0e01c162f45f68afa0acd1ffcfb41a25f9dac376 Mon Sep 17 00:00:00 2001 From: ahmetkca Date: Mon, 11 Dec 2023 04:06:33 -0500 Subject: [PATCH 2/2] Refactor Azure DevOps adapter and fix code formatting issues --- llama_hub/github_repo/azure_devops.py | 56 ++++++++++++++------------- llama_hub/github_repo/base.py | 4 +- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/llama_hub/github_repo/azure_devops.py b/llama_hub/github_repo/azure_devops.py index a78045952d..324e2afeca 100644 --- a/llama_hub/github_repo/azure_devops.py +++ b/llama_hub/github_repo/azure_devops.py @@ -7,11 
+7,11 @@ from typing import Any, Dict, List, Optional from llama_hub.github_repo.github_client import ( - BaseGithubClient, - GitBlobResponseModel, - GitBranchResponseModel, - GitCommitResponseModel, - GitTreeResponseModel + BaseGithubClient, + GitBlobResponseModel, + GitBranchResponseModel, + GitCommitResponseModel, + GitTreeResponseModel, ) from azure.devops.v7_0.git.git_client import GitClient @@ -39,9 +39,8 @@ class AzureDevOpsAdapter(BaseGithubClient): - `ImportError`: If azure-devops package is not installed. - `ValueError`: If base_url, username or password is not provided. """ - def __init__(self, - *args: Any, - **kwargs: Any) -> None: + + def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) try: from azure.devops.connection import Connection @@ -51,12 +50,17 @@ def __init__(self, "Please install azure-devops package to use Azure DevOps adapter" ) if kwargs.get("base_url") is None: - raise ValueError("Azure DevOps base_url is required. Example: 'https://dev.azure.com/YOURORG'") + raise ValueError( + "Azure DevOps base_url is required. Example: 'https://dev.azure.com/YOURORG'" + ) if kwargs.get("username") is None: - raise ValueError("Azure DevOps username is required. You can leave this blank if you are using a PAT. ex: ''") + raise ValueError( + "Azure DevOps username is required. You can leave this blank if you are using a PAT. ex: ''" + ) if kwargs.get("password") is None: - raise ValueError("Azure DevOps password is required. Personal Access Token (PAT) is recommended.") - + raise ValueError( + "Azure DevOps password is required. Personal Access Token (PAT) is recommended." 
+ ) self.connection = Connection( base_url=kwargs.get("base_url"), @@ -105,13 +109,15 @@ async def get_tree( git_tree_object_list: List[GitTreeResponseModel.GitTreeObject] = [] tree_entry: GitTreeEntryRef for tree_entry in _git_tree_response.tree_entries: - git_tree_object: GitTreeResponseModel.GitTreeObject = GitTreeResponseModel.GitTreeObject( - path=tree_entry.relative_path, - mode=tree_entry.mode, - type=tree_entry.git_object_type, - sha=tree_entry.object_id, - url=tree_entry.url, - size=tree_entry.size, + git_tree_object: GitTreeResponseModel.GitTreeObject = ( + GitTreeResponseModel.GitTreeObject( + path=tree_entry.relative_path, + mode=tree_entry.mode, + type=tree_entry.git_object_type, + sha=tree_entry.object_id, + url=tree_entry.url, + size=tree_entry.size, + ) ) git_tree_object_list.append(git_tree_object) return GitTreeResponseModel( @@ -121,7 +127,6 @@ async def get_tree( truncated=False, ) - async def get_blob( self, owner: str, @@ -167,7 +172,7 @@ async def get_blob( encoding="utf-8", sha=_git_blob_response.object_id, url=_git_blob_response.url, - node_id=None + node_id=None, ) async def get_commit( @@ -200,7 +205,8 @@ async def get_commit( tree=GitCommitResponseModel.Commit.Tree( sha=_git_commit_response.tree_id, ), - )) + ), + ) async def get_branch( self, @@ -221,9 +227,7 @@ async def get_branch( - `branch (GitBranchResponseModel)`: Branch response model. 
""" _git_branch_response: GitBranchStats = self._git_client.get_branch( - repository_id=repo, - project=owner, - name=branch + repository_id=repo, project=owner, name=branch ) # get the latest commit for the branch @@ -243,4 +247,4 @@ async def get_branch( ), ), _links=None, - ) \ No newline at end of file + ) diff --git a/llama_hub/github_repo/base.py b/llama_hub/github_repo/base.py index 6328bd0345..4e529ac652 100644 --- a/llama_hub/github_repo/base.py +++ b/llama_hub/github_repo/base.py @@ -19,7 +19,6 @@ from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS from llama_index.readers.schema.base import Document from llama_hub.github_repo import github_client -from llama_hub.github_repo.azure_devops import AzureDevOpsAdapter from llama_hub.github_repo.github_client import ( BaseGithubClient, @@ -407,13 +406,12 @@ async def _generate_documents( ) # tried to decode the content that was base64 encoded but failed # continue - if blob_data.encoding == "base64": + if blob_data.encoding == "base64": continue # if the content was not base64 encoded and we failed to decode it # as base64, then we assume it is raw text decoded_bytes = blob_data.content - if self._use_parser: document = self._parse_supported_file( file_path=full_path,