From bc44f698f17be5213b6c5dc3be13c10b1c293c20 Mon Sep 17 00:00:00 2001 From: Kartik Gupta <84975264+kartik-gupta-ij@users.noreply.github.com> Date: Fri, 17 Nov 2023 22:45:03 +0530 Subject: [PATCH] Add file search functionality by adding a separate collection for files (#5) * Add file search functionality by adding a separate collection for files * duplication error fixed * use file path from env + reformat --------- Co-authored-by: generall --- code_search/config.py | 1 + code_search/get_file.py | 38 ++++++ code_search/index/file_uploader.py | 42 +++++++ code_search/index/files_to_json.py | 41 +++++++ code_search/service.py | 8 ++ frontend/src/api/constants.ts | 2 + frontend/src/api/file.ts | 13 ++ .../CodeContainer/CodeContainer.module.css | 2 + .../src/components/CodeContainer/index.tsx | 112 +++++++++++------- frontend/src/hooks/useGetFile.ts | 46 +++++++ frontend/src/hooks/useGetSearchResult.ts | 2 - tools/index_qdrant.sh | 4 + 12 files changed, 268 insertions(+), 43 deletions(-) create mode 100644 code_search/get_file.py create mode 100644 code_search/index/file_uploader.py create mode 100644 code_search/index/files_to_json.py create mode 100644 frontend/src/api/file.ts create mode 100644 frontend/src/hooks/useGetFile.ts diff --git a/code_search/config.py b/code_search/config.py index 4b8c582..5f5b037 100644 --- a/code_search/config.py +++ b/code_search/config.py @@ -13,6 +13,7 @@ QDRANT_CODE_COLLECTION_NAME = "code-snippets-unixcoder" QDRANT_NLU_COLLECTION_NAME = "code-signatures" +QDRANT_FILE_COLLECTION_NAME="code-files" ENCODER_NAME = "all-MiniLM-L6-v2" ENCODER_SIZE = 384 diff --git a/code_search/get_file.py b/code_search/get_file.py new file mode 100644 index 0000000..2c66bdc --- /dev/null +++ b/code_search/get_file.py @@ -0,0 +1,38 @@ +from typing import List + +from qdrant_client import QdrantClient +from qdrant_client.http import models + +from code_search.config import QDRANT_URL, QDRANT_API_KEY, QDRANT_FILE_COLLECTION_NAME + +class FileGet: + + def __init__(self): + self.collection_name = QDRANT_FILE_COLLECTION_NAME + self.client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY) + + def get(self, path, limit=5) -> List[dict]: + result = self.client.scroll( + collection_name=self.collection_name, + scroll_filter=models.Filter( + must=[ + models.FieldCondition( + key="path", + match=models.MatchValue(value=path), + ) + ] + ), + limit=limit, + ) + + return [hit.payload for hit in result[0]] + + +if __name__ == '__main__': + path = "lib/collection/src/collection_manager/optimizers/indexing_optimizer.rs" + + searcher = FileGet() + + res = searcher.get(path) + for hit in res: + print(hit) diff --git a/code_search/index/file_uploader.py b/code_search/index/file_uploader.py new file mode 100644 index 0000000..dede9c5 --- /dev/null +++ b/code_search/index/file_uploader.py @@ -0,0 +1,42 @@ +from pathlib import Path +from qdrant_client import QdrantClient +import json + +from code_search.config import QDRANT_URL, QDRANT_API_KEY, DATA_DIR, QDRANT_FILE_COLLECTION_NAME + + +def encode_and_upload(): + qdrant_client = QdrantClient( + QDRANT_URL, + api_key=QDRANT_API_KEY, + ) + + collection_name = QDRANT_FILE_COLLECTION_NAME + input_file = Path(DATA_DIR) / "rs_files.json" + + if not input_file.exists(): + raise RuntimeError(f"File {input_file} does not exist. Skipping") + + payload = [] + with open(input_file, 'r') as json_file: + data = json.load(json_file) + payload = data + + print(f"Recreating the collection {collection_name}") + qdrant_client.recreate_collection( + collection_name=collection_name, + vectors_config={} + ) + + print(f"Storing data in the collection {collection_name}") + qdrant_client.upload_collection( + collection_name=collection_name, + payload=payload, + vectors=[{}] * len(payload), + ids=None, + batch_size=256 + ) + + +if __name__ == '__main__': + encode_and_upload() diff --git a/code_search/index/files_to_json.py b/code_search/index/files_to_json.py new file mode 100644 index 0000000..4f1a295 --- /dev/null +++ b/code_search/index/files_to_json.py @@ -0,0 +1,41 @@ +import os.path +import json +from pathlib import Path + +from code_search.config import DATA_DIR + + +def process_file(root_dir, file_path): + with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: + code_lines = file.readlines() + relative_path = os.path.relpath(file_path, root_dir) + return { + "path": relative_path, + "code": code_lines, + "startline": 1, + "endline": len(code_lines) + } + + +def explore_directory(root_dir): + result = [] + for foldername, subfolders, filenames in os.walk(root_dir): + for filename in filenames: + file_path = os.path.join(foldername, filename) + if file_path.endswith('.rs'): + result.append(process_file(root_dir, file_path)) + return result + + +def main(): + folder_path = os.getenv('QDRANT_PATH') + output_file = Path(DATA_DIR) / "rs_files.json" + + files_data = explore_directory(folder_path) + + with open(output_file, 'w', encoding='utf-8') as json_file: + json.dump(files_data, json_file, indent=2) + + +if __name__ == "__main__": + main() diff --git a/code_search/service.py b/code_search/service.py index 550eb5a..70e1ef7 100644 --- a/code_search/service.py +++ b/code_search/service.py @@ -5,10 +5,12 @@ from code_search.config import ROOT_DIR from code_search.searcher import CombinedSearcher +from code_search.get_file import FileGet app = FastAPI() searcher = CombinedSearcher() +get_file = FileGet() @app.get("/api/search") @@ -17,6 +19,12 @@ async def search(query: str): "result": searcher.search(query, limit=5) } +@app.get("/api/file") +async def file(path: str): + return { + "result": get_file.get(path) + } + app.mount("/", StaticFiles(directory=os.path.join(ROOT_DIR, 'frontend', 'dist'), html=True)) diff --git a/frontend/src/api/constants.ts b/frontend/src/api/constants.ts index c970a71..2c2f25a 100644 --- a/frontend/src/api/constants.ts +++ b/frontend/src/api/constants.ts @@ -1,3 +1,5 @@ const API_V1 = "api/"; export const SEARCH_URL = `${API_V1}search`; + +export const FILE_URL = `${API_V1}file`; diff --git a/frontend/src/api/file.ts b/frontend/src/api/file.ts new file mode 100644 index 0000000..1170931 --- /dev/null +++ b/frontend/src/api/file.ts @@ -0,0 +1,13 @@ +import { Axios } from "./axios"; +import { FILE_URL } from "./constants"; + +export type PathRequest = { + path: string; +}; + +export const getFileResult = (PathRequest: PathRequest) => { + const params = { + path: PathRequest.path, + }; + return Axios().get(FILE_URL, { params }); +}; diff --git a/frontend/src/components/CodeContainer/CodeContainer.module.css b/frontend/src/components/CodeContainer/CodeContainer.module.css index 5820737..fe04d9c 100644 --- a/frontend/src/components/CodeContainer/CodeContainer.module.css +++ b/frontend/src/components/CodeContainer/CodeContainer.module.css @@ -36,6 +36,8 @@ .codeLoad { padding-right: 1rem; padding-left: 1rem; + height: 24px; + width: 56px; color: #646d76; background-color: #bddfff; display: flex; diff --git a/frontend/src/components/CodeContainer/index.tsx b/frontend/src/components/CodeContainer/index.tsx index 8686c49..1b78447 100644 --- a/frontend/src/components/CodeContainer/index.tsx +++ b/frontend/src/components/CodeContainer/index.tsx @@ -1,4 +1,4 @@ -import { Box, Button, Image, ThemeIcon, Tooltip } from "@mantine/core"; +import { Box, Button, Image, Loader, ThemeIcon, Tooltip } from "@mantine/core"; import classes from "./CodeContainer.module.css"; import { Highlight, themes } from "prism-react-renderer"; import { @@ -7,6 +7,8 @@ import { IconFoldUp, } from "@tabler/icons-react"; import useMountedState from "@/hooks/useMountedState"; +import { useGetFile } from "@/hooks/useGetFile"; +import { useEffect } from "react"; type CodeContainerProps = { code_type: string; @@ -16,8 +18,6 @@ type CodeContainerProps = { module: string; snippet: string; struct_name: string; - upper_lines: string; - lower_lines: string; }; docstring: string | null; line: number; @@ -35,42 +35,61 @@ const loadCount = 10; export function CodeContainer(props: CodeContainerProps) { const { context, line_from, sub_matches, line_to } = props; const [codeLineFrom, setCodeLineFrom] = useMountedState(line_from); - const [codeLineTo, setCodeLineTo] = useMountedState(0); + const [codeLineTo, setCodeLineTo] = useMountedState(line_to); const [code, setCode] = useMountedState(props.context.snippet); + const { data, error, loading, getFile } = useGetFile(); + const [inStack, setInStack] = useMountedState< + "loadUpperCode" | "loadLowerCode" | null + >(null); const loadUpperCode = () => { - const upperCodeArray = context.upper_lines.split("\n"); - const upperCode = upperCodeArray - .slice( - codeLineFrom - loadCount + 1 > 0 ? codeLineFrom - loadCount + 1 : 0, - codeLineFrom - ) - .join("\n"); - setCodeLineFrom((number) => { - return number - loadCount > 0 ? number - loadCount : 1; - }); - setCode(`${upperCode}${code}`); + if (!data) { + getFile(context.file_path); + setInStack("loadUpperCode"); + } + if (data) { + const upperCodeArray = data.result[0].code; + const upperCode = upperCodeArray + .slice( + codeLineFrom - loadCount + 1 > 0 ? codeLineFrom - loadCount + 1 : 0, + codeLineFrom + ) + .join(""); + setCodeLineFrom((number) => { + return number - loadCount > 0 ? number - loadCount : 1; + }); + setCode(`${upperCode}${code}`); + } }; const loadLowerCode = () => { - const lowerCodeArray = context.lower_lines.split("\n"); - if (lowerCodeArray.length > codeLineTo + loadCount) { + if (!data) { + getFile(context.file_path); + setInStack("loadLowerCode"); + } + if (data) { + const lowerCodeArray = data.result[0].code; const lowerCode = lowerCodeArray - .slice(codeLineTo, codeLineTo + loadCount + 1) - .join("\n"); + .slice(codeLineTo, codeLineTo + loadCount) + .join(""); setCodeLineTo((number) => { return number + loadCount; }); setCode(`${code}${lowerCode}`); - } else { - const lowerCode = lowerCodeArray - .slice(codeLineTo, lowerCodeArray.length) - .join("\n"); - setCodeLineTo(lowerCodeArray.length); - setCode(`${code}${lowerCode}`); } }; + useEffect(() => { + if (inStack === "loadUpperCode" && data) { + loadUpperCode(); + setInStack(null); + } + if (inStack === "loadLowerCode" && data) { + loadLowerCode(); + setInStack(null); + } + }, [data]); + return ( - + {loading && inStack === "loadUpperCode" ? ( + + ) : ( + + )}
- @@ {1} - {codeLineFrom - 1} of {context.file_name} + {error + ? error + : `@@ 1 - ${codeLineFrom - 1} of ${context.file_name}`}
@@ -174,10 +199,7 @@ export function CodeContainer(props: CodeContainerProps) { ))}
= data?.result[0].endline ? { display: "none" } : { display: "flex", @@ -191,12 +213,12 @@ export function CodeContainer(props: CodeContainerProps) { } > - + {loading && inStack === "loadLowerCode" ? ( + + ) : ( + + )}
- @@ {line_to + codeLineTo + 2} -{" "} - {context.lower_lines.split("\n").length + line_to} of{" "} - {context.file_name} + {error + ? error + : `@@ ${codeLineTo + 2} - ${ + data?.result[0].endline + ? data?.result[0].endline + 1 + : "end" + } of ${context.file_name}`}
diff --git a/frontend/src/hooks/useGetFile.ts b/frontend/src/hooks/useGetFile.ts new file mode 100644 index 0000000..25265dd --- /dev/null +++ b/frontend/src/hooks/useGetFile.ts @@ -0,0 +1,46 @@ +import { StatusCodes } from "http-status-codes"; +import useMountedState from "./useMountedState"; +import { getFileResult } from "@/api/file"; + +export type searchResponse = { + result: { + code: string[]; + endline: number; + startline: number; + path: string; + }[]; +}; +export const useGetFile = () => { + const [data, setData] = useMountedState(null); + const [error, setError] = useMountedState(null); + const [loading, setLoading] = useMountedState(false); + + const getFile = async (path: string) => { + try { + setLoading(true); + setError(null); + const res = await getFileResult({ path }); + + switch (res.status) { + case StatusCodes.OK: { + const searchRes = res.data; + setData(searchRes); + break; + } + default: { + setError("Failed to get the file"); + } + } + } catch { + setError("Failed to get the file"); + } finally { + setLoading(false); + } + }; + + const resetData = () => { + setData(null); + }; + + return { data, error, loading, getFile, resetData }; +}; diff --git a/frontend/src/hooks/useGetSearchResult.ts b/frontend/src/hooks/useGetSearchResult.ts index eff3237..2466649 100644 --- a/frontend/src/hooks/useGetSearchResult.ts +++ b/frontend/src/hooks/useGetSearchResult.ts @@ -11,8 +11,6 @@ export type searchResponse = { module: string; snippet: string; struct_name: string; - upper_lines: string; - lower_lines: string; }; docstring: string | null; line: number; diff --git a/tools/index_qdrant.sh b/tools/index_qdrant.sh index f0e4a10..02e3d09 100644 --- a/tools/index_qdrant.sh +++ b/tools/index_qdrant.sh @@ -11,6 +11,10 @@ SCRIPT_PATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" ROOT_PATH=$SCRIPT_PATH/.. +python -m code_search.index.files_to_json + +python -m code_search.index.file_uploader + rustup run stable rust-analyzer -v lsif $QDRANT_PATH > $ROOT_PATH/data/index.lsif python -m code_search.index.convert_lsif_index