diff --git a/.semversioner/next-release/patch-20250116150133634829.json b/.semversioner/next-release/patch-20250116150133634829.json new file mode 100644 index 0000000000..916cb10ae9 --- /dev/null +++ b/.semversioner/next-release/patch-20250116150133634829.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Adding escape and quote characters to the pandas read_csv logic used by the csv file loader." +} diff --git a/docs/config/env_vars.md b/docs/config/env_vars.md index 4529dcdf4a..47e7ae8229 100644 --- a/docs/config/env_vars.md +++ b/docs/config/env_vars.md @@ -19,7 +19,7 @@ If the embedding target is `all`, and you want to only embed a subset of these f ## Input Data -Our pipeline can ingest .csv or .txt data from an input folder. These files can be nested within subfolders. To configure how input data is handled, what fields are mapped over, and how timestamps are parsed, look for configuration values starting with `GRAPHRAG_INPUT_` below. In general, CSV-based data provides the most customizability. Each CSV should at least contain a `text` field (which can be mapped with environment variables), but it's helpful if they also have `title`, `timestamp`, and `source` fields. Additional fields can be included as well, which will land as extra fields on the `Document` table. +Our pipeline can ingest .csv or .txt data from an input folder. These files can be nested within subfolders. To configure how input data is handled, what fields are mapped over, and how timestamps are parsed, look for configuration values starting with `GRAPHRAG_INPUT_` below. In general, CSV-based data provides the most customizability. Each CSV should at least contain a `text` field (which can be mapped with environment variables), but it's helpful if they also have `title`, `timestamp`, and `source` fields. Additional fields can be included as well, which will land as extra fields on the `Document` table. 
The pipeline assumes a backslash (`\`) is used for the escape character and that the quote character is a double quote (`"`). ## Base LLM Settings diff --git a/graphrag/index/input/csv.py b/graphrag/index/input/csv.py index ce1fa20bfd..5d7021a490 100644 --- a/graphrag/index/input/csv.py +++ b/graphrag/index/input/csv.py @@ -35,7 +35,12 @@ async def load_file(path: str, group: dict | None) -> pd.DataFrame: if group is None: group = {} buffer = BytesIO(await storage.get(path, as_bytes=True)) - data = pd.read_csv(buffer, encoding=config.encoding or "latin-1") + data = pd.read_csv( + buffer, + encoding=config.encoding or "latin-1", + escapechar="\\", + quotechar='"', + ) additional_keys = group.keys() if len(additional_keys) > 0: data[[*additional_keys]] = data.apply( diff --git a/poetry.lock b/poetry.lock index c2d5d71298..a68173cd86 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. 
[[package]] name = "aiofiles" @@ -788,7 +788,6 @@ files = [ {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:761817a3377ef15ac23cd7834715081791d4ec77f9297ee694ca1ee9c2c7e5eb"}, {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3c672a53c0fb4725a29c303be906d3c1fa99c32f58abe008a82705f9ee96f40b"}, {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:4ac4c9f37eba52cb6fbeaf5b59c152ea976726b865bd4cf87883a7e7006cc543"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:60eb32934076fa07e4316b7b2742fa52cbb190b42c2df2863dbc4230a0a9b385"}, {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ed3534eb1090483c96178fcb0f8893719d96d5274dfde98aa6add34614e97c8e"}, {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f3f6fdfa89ee2d9d496e2c087cebef9d4fcbb0ad63c40e821b39f74bf48d9c5e"}, {file = "cryptography-44.0.0-cp37-abi3-win32.whl", hash = "sha256:eb33480f1bad5b78233b0ad3e1b0be21e8ef1da745d8d2aecbb20671658b9053"}, @@ -799,7 +798,6 @@ files = [ {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c5eb858beed7835e5ad1faba59e865109f3e52b3783b9ac21e7e47dc5554e289"}, {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f53c2c87e0fb4b0c00fa9571082a057e37690a8f12233306161c8f4b819960b7"}, {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:9e6fc8a08e116fb7c7dd1f040074c9d7b51d74a8ea40d4df2fc7aa08b76b9e6c"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:9abcc2e083cbe8dde89124a47e5e53ec38751f0d7dfd36801008f316a127d7ba"}, {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2436114e46b36d00f8b72ff57e598978b37399d2786fd39793c36c6d5cb1c64"}, {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = 
"sha256:a01956ddfa0a6790d594f5b34fc1bfa6098aca434696a03cfdbe469b8ed79285"}, {file = "cryptography-44.0.0-cp39-abi3-win32.whl", hash = "sha256:eca27345e1214d1b9f9490d200f9db5a874479be914199194e746c893788d417"}, @@ -5209,4 +5207,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "3359856d17fe2df1cb81270300efa552fff2971e4896480270daedd443695865" +content-hash = "0b7df8d1a42bc85d4da829c955d0e9ebf66d271a7e851da46b333e90692f3a4e" diff --git a/pyproject.toml b/pyproject.toml index 9fd943ea2c..6673683965 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,6 +90,7 @@ tenacity = "^9.0.0" json-repair = "^0.30.3" tqdm = "^4.67.1" httpx = "^0.28.1" +semversioner = "^2.0.5" [tool.poetry.group.dev.dependencies] coverage = "^7.6.9"