From bb123d1563a25eddce6d893e3e7a9f8afd054085 Mon Sep 17 00:00:00 2001 From: SeanHH86 <154984842+SeanHH86@users.noreply.github.com> Date: Fri, 3 Jan 2025 22:05:13 +0800 Subject: [PATCH] support upload folder (#66) * support upload folder * support upload file to folder --- README.md | 38 ++++++++++++++++++--- README_cn.md | 38 ++++++++++++++++++--- pycsghub/cli.py | 40 ++++++++++++++++------ pycsghub/cmd/repo.py | 57 +++++++++++++++++++++++++------ pycsghub/constants.py | 3 ++ pycsghub/file_upload.py | 4 ++- pycsghub/repository.py | 74 ++++++++++++++++++++++++----------------- setup.py | 2 +- 8 files changed, 192 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index d6ed59a..12b4fc8 100644 --- a/README.md +++ b/README.md @@ -91,11 +91,17 @@ csghub-cli download wanghh2000/myprivate1 # donwload dataset csghub-cli download wanghh2000/myds1 -t dataset -# upload a single file -csghub-cli upload wanghh2000/myprivate1 abc/3.txt +# upload a single file to folder1 +csghub-cli upload wanghh2000/myprivate1 abc/3.txt folder1 -# upload files -csghub-cli upload wanghh2000/myds1 abc/4.txt abc/5.txt -t dataset +# upload local folder '/Users/hhwang/temp/jsonl' to root path of repo 'wanghh2000/m01' with default branch +csghub-cli upload wanghh2000/m01 /Users/hhwang/temp/jsonl + +# upload local folder '/Users/hhwang/temp/jsonl' to path 'test/files' of repo 'wanghh2000/m01' with branch v1 +csghub-cli upload wanghh2000/m01 /Users/hhwang/temp/jsonl test/files --revision v1 + +# upload local folder '/Users/hhwang/temp/jsonl' to path 'test/files' of repo 'wanghh2000/m01' with token 'xxxxxx' +csghub-cli upload wanghh2000/m01 /Users/hhwang/temp/jsonl test/files -k xxxxxx ``` Download location is `~/.cache/csg/` by default. @@ -184,7 +190,7 @@ for item in repo_files: http_upload_file(repo_id=repo_id, repo_type=repo_type, file_path=item, endpoint=endpoint, token=token) ``` -### Upload repo +### Upload the local path to repo Before starting, please make sure you have Git-LFS installed (see [here](https://git-lfs.github.com/) for installation instructions). @@ -204,6 +210,28 @@ r = Repository( r.upload() ``` +### Upload the local path to the specified path in the repo + +Before starting, please make sure you have Git-LFS installed (see [here](https://git-lfs.github.com/) for installation instructions). + +```python +from pycsghub.repository import Repository + +token = "your access token" + +r = Repository( + repo_id="wanghh2000/model01", + upload_path="/Users/hhwang/temp/jsonl", + path_in_repo="test/abc", + user_name="wanghh2000", + token=token, + repo_type="model", + branch_name="v1", +) + +r.upload() +``` + ### Model loading compatible with huggingface The transformers library supports directly inputting the repo_id from Hugging Face to download and load related models, as shown below: diff --git a/README_cn.md b/README_cn.md index 1a1fb25..7cfdb13 100644 --- a/README_cn.md +++ b/README_cn.md @@ -90,11 +90,17 @@ csghub-cli download wanghh2000/myprivate1 # 数据集下载 csghub-cli download wanghh2000/myds1 -t dataset -# 上传单个文件 -csghub-cli upload wanghh2000/myprivate1 abc/3.txt +# 上传单个文件到仓库目录folder1 +csghub-cli upload wanghh2000/myprivate1 abc/3.txt folder1 -# 上传多个文件 -csghub-cli upload wanghh2000/myds1 abc/4.txt abc/5.txt -t dataset +# 上传本地目录'/Users/hhwang/temp/jsonl'到仓库'wanghh2000/m01'的默认分支根目录下 +csghub-cli upload wanghh2000/m01 /Users/hhwang/temp/jsonl + +# 上传本地目录'/Users/hhwang/temp/jsonl'到仓库'wanghh2000/m01'的v1分支的'test/files'目录下 +csghub-cli upload wanghh2000/m01 /Users/hhwang/temp/jsonl test/files --revision v1 + +# 上传本地目录'/Users/hhwang/temp/jsonl'到仓库'wanghh2000/m01'的默认分支'test/files'目录下并使用指定token +csghub-cli upload wanghh2000/m01 /Users/hhwang/temp/jsonl test/files -k xxxxxx ``` 文件默认下载路径为`~/.cache/csg/` @@ -184,7 +190,7 @@ for item in repo_files: http_upload_file(repo_id=repo_id, repo_type=repo_type, file_path=item, endpoint=endpoint, token=token) ``` -### 上传仓库 +### 上传本地目录到仓库 在开始之前,请确保您已安装 Git-LFS(安装说明请参见 [这里](https://git-lfs.github.com/))。 @@ -204,6 +210,28 @@ r = Repository( r.upload() ``` +### 上传本地目录到仓库的指定目录 + +在开始之前,请确保您已安装 Git-LFS(安装说明请参见 [这里](https://git-lfs.github.com/))。 + +```python +from pycsghub.repository import Repository + +token = "your access token" + +r = Repository( + repo_id="wanghh2000/model01", + upload_path="/Users/hhwang/temp/jsonl", + path_in_repo="test/abc", + user_name="wanghh2000", + token=token, + repo_type="model", + branch_name="v1", +) + +r.upload() +``` + ### 兼容huggingface的模型加载 huggingface的transformers库支持直接输入huggingface上的repo_id以下载并读取相关模型,如下列所示: diff --git a/pycsghub/cli.py b/pycsghub/cli.py index e19ddbf..32a4535 100644 --- a/pycsghub/cli.py +++ b/pycsghub/cli.py @@ -1,4 +1,5 @@ import typer +import os from typing import Annotated, List, Optional from pycsghub.cmd import repo from pycsghub.cmd.repo_types import RepoType @@ -15,11 +16,13 @@ def version_callback(value: bool): OPTIONS = { "repoID": typer.Argument(help="The ID of the repo. (e.g. `username/repo-name`)."), - "repoFiles": typer.Argument(help="Local path to the file or files to upload. Defaults to the relative path of the file of repo of OpenCSG Hub."), + "localPath": typer.Argument(help="Local path to the file or folder to upload. Defaults to the relative path of the file of repo of OpenCSG Hub."), + "pathInRepo": typer.Argument(help="Path of the folder in the repo. Defaults to the relative path of the file or folder."), "repoType": typer.Option("-t", "--repo-type", help="Specify the repository type."), "revision": typer.Option("-r", "--revision", help="An optional Git revision id which can be a branch name"), "cache_dir": typer.Option("-cd", "--cache-dir", help="Path to the directory where to save the downloaded files."), "endpoint": typer.Option("-e", "--endpoint", help="The address of the request to be sent."), + "username": typer.Option("-u", "--username", help="Logon account of OpenCSG Hub."), "token": typer.Option("-k", "--token", help="A User Access Token generated from https://opencsg.com/settings/access-token"), "version": typer.Option(None, "-V", "--version", callback=version_callback, is_eager=True, help="Show the version and exit."), } @@ -45,20 +48,37 @@ def download( @app.command(name="upload", help="Upload repository files to opencsg.com.") def upload( repo_id: Annotated[str, OPTIONS["repoID"]], - repo_files: Annotated[List[str], OPTIONS["repoFiles"]], + local_path: Annotated[str, OPTIONS["localPath"]], + path_in_repo: Annotated[str, OPTIONS["pathInRepo"]] = "", repo_type: Annotated[RepoType, OPTIONS["repoType"]] = RepoType.MODEL, revision: Annotated[Optional[str], OPTIONS["revision"]] = DEFAULT_REVISION, endpoint: Annotated[Optional[str], OPTIONS["endpoint"]] = DEFAULT_CSGHUB_DOMAIN, token: Annotated[Optional[str], OPTIONS["token"]] = None, + user_name: Annotated[Optional[str], OPTIONS["username"]] = "", ): - repo.upload( - repo_id=repo_id, - repo_type=repo_type, - repo_files=repo_files, - revision=revision, - endpoint=endpoint, - token=token - ) + # File upload + if os.path.isfile(local_path): + repo.upload_files( + repo_id=repo_id, + repo_type=repo_type, + repo_file=local_path, + path_in_repo=path_in_repo, + revision=revision, + endpoint=endpoint, + token=token + ) + # Folder upload + else: + repo.upload_folder( + repo_id=repo_id, + repo_type=repo_type, + local_path=local_path, + path_in_repo=path_in_repo, + revision=revision, + endpoint=endpoint, + token=token, + user_name=user_name + ) @app.callback(invoke_without_command=True) def main(version: bool = OPTIONS["version"]): diff --git a/pycsghub/cmd/repo.py b/pycsghub/cmd/repo.py index 5da0cf4..74952b0 100644 --- a/pycsghub/cmd/repo.py +++ b/pycsghub/cmd/repo.py @@ -4,6 +4,7 @@ from typing import Optional, Union, List from pycsghub.constants import DEFAULT_REVISION import requests +from pycsghub.repository import Repository def download( repo_id: str, @@ -22,20 +23,54 @@ def download( token=token, ) -def upload( +def upload_files( repo_id: str, repo_type: str, - repo_files: List[str], + repo_file: str, + path_in_repo: Optional[str] = "", revision: Optional[str] = DEFAULT_REVISION, endpoint: Optional[str] = None, token: Optional[str] = None ): - for item in repo_files: - http_upload_file( - repo_id=repo_id, - repo_type=repo_type, - file_path=item, - revision=revision, - endpoint=endpoint, - token=token, - ) + http_upload_file( + repo_id=repo_id, + repo_type=repo_type, + file_path=repo_file, + path_in_repo=path_in_repo, + revision=revision, + endpoint=endpoint, + token=token, + ) + +def upload_folder( + repo_id: str, + repo_type: str, + local_path: str, + path_in_repo: Optional[str] = "", + work_dir: Optional[str] = "/tmp/csg", + nickname: Optional[str] = "", + description: Optional[str] = "", + license: Optional[str] = "apache-2.0", + revision: Optional[str] = DEFAULT_REVISION, + endpoint: Optional[str] = None, + user_name: Optional[str] = "", + token: Optional[str] = None, + auto_create: Optional[bool] = True, + ): + r = Repository( + repo_id=repo_id, + upload_path=local_path, + path_in_repo=path_in_repo, + work_dir=work_dir, + repo_type=repo_type, + nickname=nickname, + description=description, + license=license, + branch_name=revision, + endpoint=endpoint, + user_name=user_name, + token=token, + auto_create=auto_create, + ) + r.upload() + diff --git a/pycsghub/constants.py b/pycsghub/constants.py index 953873e..8b821b4 100644 --- a/pycsghub/constants.py +++ b/pycsghub/constants.py @@ -86,3 +86,6 @@ S3_INTERNAL = os.environ.get("S3_INTERNAL", '') + +GIT_HIDDEN_DIR = ".git" +GIT_ATTRIBUTES_FILE = ".gitattributes" \ No newline at end of file diff --git a/pycsghub/file_upload.py b/pycsghub/file_upload.py index 76aff24..5c6ebbf 100644 --- a/pycsghub/file_upload.py +++ b/pycsghub/file_upload.py @@ -8,19 +8,21 @@ def http_upload_file( repo_id: str, repo_type: Optional[str] = None, file_path: str = None, + path_in_repo: Optional[str] = "", revision: Optional[str] = DEFAULT_REVISION, endpoint: Optional[str] = None, token: Optional[str] = None, ): if not os.path.exists(file_path): raise ValueError(f"file '{file_path}' does not exist") + destination_path = os.path.join(path_in_repo, os.path.basename(file_path)) if path_in_repo else file_path http_endpoint = endpoint if endpoint is not None else get_endpoint() if not http_endpoint.endswith("/"): http_endpoint += "/" http_url = http_endpoint + "api/v1/" + repo_type + "s/" + repo_id + "/upload_file" post_headers = build_csg_headers(token=token) file_data = {'file': open(file_path, 'rb')} - form_data = {'file_path': file_path, 'branch': revision, 'message': 'upload' + file_path} + form_data = {'file_path': destination_path, 'branch': revision, 'message': 'upload' + file_path} response = requests.post(http_url, headers=post_headers, data=form_data, files=file_data) if response.status_code == 200: print(f"file '{file_path}' upload successfully.") diff --git a/pycsghub/repository.py b/pycsghub/repository.py index ab5b492..392d208 100644 --- a/pycsghub/repository.py +++ b/pycsghub/repository.py @@ -8,16 +8,30 @@ import shutil import re from urllib.parse import urlparse -from pycsghub.constants import GIT_ATTRIBUTES_CONTENT, OPERATION_ACTION_GIT, REPO_TYPE_DATASET, REPO_TYPE_SPACE, REPO_TYPE_CODE +from pycsghub.constants import (GIT_ATTRIBUTES_CONTENT, + OPERATION_ACTION_GIT, + REPO_TYPE_DATASET, + REPO_TYPE_SPACE, + REPO_TYPE_CODE) +from pycsghub.constants import (GIT_HIDDEN_DIR, GIT_ATTRIBUTES_FILE) from pycsghub.utils import (build_csg_headers, model_id_to_group_owner_name, get_endpoint) +def ignore_folders(folder, contents): + ignored = [] + exclude_list = [GIT_HIDDEN_DIR] + for item in contents: + if item in exclude_list: + ignored.append(item) + return ignored + class Repository: def __init__( self, repo_id: str, upload_path: str, + path_in_repo: Optional[str] = "", branch_name: Optional[str] = "main", work_dir: Optional[str] = "/tmp/csg", user_name: Optional[str] = "", @@ -28,10 +42,10 @@ def __init__( repo_type: Optional[str] = None, endpoint: Optional[str] = None, auto_create: Optional[bool] = True, - copy_files: Optional[bool] = True, ): self.repo_id = repo_id self.upload_path = upload_path + self.path_in_repo = path_in_repo self.branch_name = branch_name self.work_dir = work_dir self.user_name = user_name @@ -42,10 +56,10 @@ def __init__( self.repo_type = repo_type self.endpoint = endpoint self.auto_create = auto_create - self.copy_files = copy_files self.repo_url_prefix = self.get_url_prefix() self.namespace, self.name = model_id_to_group_owner_name(model_id=self.repo_id) self.repo_dir = os.path.join(self.work_dir, self.name) + self.user_name = self.user_name if self.user_name else self.namespace def get_url_prefix(self): if self.repo_type == REPO_TYPE_DATASET: @@ -85,35 +99,33 @@ def upload(self) -> None: def copy_repo_files(self): from_path = "" git_cmd_workdir = "" - if self.copy_files or os.path.isfile(self.upload_path): - from_path = self.upload_path - git_cmd_workdir = self.repo_dir - - for item in os.listdir(git_cmd_workdir): - item_path = os.path.join(git_cmd_workdir, item) - if item != '.git' and item != '.gitattributes': - if os.path.isfile(item_path): - os.remove(item_path) - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - - if os.path.isfile(self.upload_path): - shutil.copyfile(self.upload_path, git_cmd_workdir) - else: - shutil.copytree(from_path, git_cmd_workdir, dirs_exist_ok=True) - else: - from_path = self.repo_dir - git_cmd_workdir = self.upload_path - - for item in os.listdir(from_path): - item_path = os.path.join(from_path, item) - if item == '.git' or item == '.gitattributes': - if os.path.isdir(item_path): - shutil.copytree(item_path, os.path.join(git_cmd_workdir, item), dirs_exist_ok=True) - else: - shutil.copy2(item_path, git_cmd_workdir) - return git_cmd_workdir + from_path = self.upload_path + git_cmd_workdir = self.repo_dir + destination_path = git_cmd_workdir + + path_suffix = f"{self.path_in_repo.strip('/')}/" if self.path_in_repo else "" + path_suffix = re.sub(r'^\./', '', path_suffix) + + destination_path = os.path.join(destination_path, path_suffix) + + if not os.path.exists(destination_path): + os.makedirs(destination_path, exist_ok=True) + + for item in os.listdir(destination_path): + item_path = os.path.join(destination_path, item) + if item != GIT_HIDDEN_DIR and item != GIT_ATTRIBUTES_FILE: + if os.path.isfile(item_path): + os.remove(item_path) + elif os.path.isdir(item_path): + shutil.rmtree(item_path) + + if os.path.isfile(self.upload_path): + shutil.copyfile(self.upload_path, destination_path) + else: + shutil.copytree(from_path, destination_path, dirs_exist_ok=True, ignore=ignore_folders) + + return git_cmd_workdir def auto_create_repo_and_branch(self): repoExist, branchExist = self.repo_exists() diff --git a/setup.py b/setup.py index ee4e5cd..c2ffb8d 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='csghub-sdk', - version='0.4.6', + version='0.4.7', author="opencsg", author_email="contact@opencsg.com", long_description=long_description,