Skip to content

Commit

Permalink
refactor(index): migrate corpus::WEB to corpus::STRUCTURED_DOC (#3352)
Browse files Browse the repository at this point in the history
* add structured doc

* [autofix.ci] apply automated fixes

* chore: implement structured_doc::DocService

* refactor(index): refactored `web_crawler.rs` to use updated `StructuredDoc` and `StructuredDocFields` types.

run make fix

* switch doc search

* chore: adapt frontend

* delete doc related files

* run make fix

* add deprecation notes for corpus::WEB

* [autofix.ci] apply automated fixes

* [autofix.ci] apply automated fixes (attempt 2/3)

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
  • Loading branch information
wsxiaoys and autofix-ci[bot] authored Nov 14, 2024
1 parent 414cac3 commit a52c4e6
Show file tree
Hide file tree
Showing 39 changed files with 773 additions and 440 deletions.
46 changes: 0 additions & 46 deletions crates/tabby-common/src/api/doc.rs

This file was deleted.

2 changes: 1 addition & 1 deletion crates/tabby-common/src/api/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pub mod code;
pub mod doc;
pub mod event;
pub mod server_setting;
pub mod structured_doc;
169 changes: 169 additions & 0 deletions crates/tabby-common/src/api/structured_doc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
use async_trait::async_trait;
use tantivy::{
schema::{self, document::CompactDocValue, Value},
TantivyDocument,
};
use thiserror::Error;

use crate::index::{structured_doc, IndexSchema};

/// Response of a doc search: the collection of hits that were found.
pub struct DocSearchResponse {
/// The hits, each pairing a document with its score.
pub hits: Vec<DocSearchHit>,
}

/// A single doc search hit: one document together with its score.
pub struct DocSearchHit {
/// Score attached to this hit.
/// NOTE(review): presumably the relevance score produced by the index — confirm
/// against the `DocSearch` implementation.
pub score: f32,
/// The document this hit refers to.
pub doc: DocSearchDocument,
}

/// A document returned by doc search, tagged by its kind.
#[derive(Clone)]
pub enum DocSearchDocument {
/// A web document.
Web(DocSearchWebDocument),
/// An issue document.
Issue(DocSearchIssueDocument),
}

/// Errors that a doc search can produce.
#[derive(Error, Debug)]
pub enum DocSearchError {
/// The index is not ready to be searched.
#[error("index not ready")]
NotReady,

/// A query parser error from tantivy, forwarded transparently.
#[error(transparent)]
QueryParserError(#[from] tantivy::query::QueryParserError),

/// Any other tantivy error, forwarded transparently.
#[error(transparent)]
TantivyError(#[from] tantivy::TantivyError),

/// Catch-all for any other failure, carried as an `anyhow::Error`.
#[error(transparent)]
Other(#[from] anyhow::Error),
}

/// Asynchronous search interface over indexed documents.
///
/// Implementors must be `Send + Sync`.
#[async_trait]
pub trait DocSearch: Send + Sync {
/// Search docs from underlying index.
///
/// * `source_ids`: Filter documents by source IDs, when empty, search all sources.
/// * `q`: The query to search for.
/// * `limit`: NOTE(review): presumably the maximum number of hits to return —
///   confirm against the implementation.
///
/// Returns the hits wrapped in a [`DocSearchResponse`], or a [`DocSearchError`]
/// on failure.
async fn search(
&self,
source_ids: &[String],
q: &str,
limit: usize,
) -> Result<DocSearchResponse, DocSearchError>;
}

/// A web document as returned by doc search.
///
/// Derives `Debug`, `PartialEq` and `Eq` in addition to `Clone` so that values
/// can be logged and compared (e.g. in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocSearchWebDocument {
    /// Title of the web document.
    pub title: String,
    /// Link to the web document.
    pub link: String,
    /// Text snippet associated with the hit.
    pub snippet: String,
}

/// An issue document as returned by doc search.
///
/// Derives `Debug`, `PartialEq` and `Eq` in addition to `Clone` so that values
/// can be logged and compared (e.g. in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocSearchIssueDocument {
    /// Title of the issue.
    pub title: String,
    /// Link to the issue.
    pub link: String,
    /// Body text of the issue.
    pub body: String,
    /// Whether the issue is closed.
    pub closed: bool,
}

/// Conversion from a pair of tantivy documents into a typed value.
pub trait FromTantivyDocument {
/// Builds `Self` from `doc` and `chunk`, returning `None` when no value can
/// be produced.
///
/// In the implementations in this file, `doc` is read through the schema's
/// `field_attributes` and `chunk` through `field_chunk_attributes`.
fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self>
where
Self: Sized;
}

impl FromTantivyDocument for DocSearchDocument {
    /// Dispatches on the `kind` attribute of `doc` and delegates to the
    /// matching concrete document type.
    ///
    /// Returns `None` when the kind is neither `"web"` nor `"issue"`, or when
    /// the delegated conversion returns `None`.
    ///
    /// # Panics
    ///
    /// Panics if the `kind` attribute cannot be read as text from `doc`.
    fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self> {
        let attributes = IndexSchema::instance().field_attributes;

        match get_json_text_field(doc, attributes, structured_doc::fields::KIND) {
            "web" => Some(Self::Web(DocSearchWebDocument::from_tantivy_document(
                doc, chunk,
            )?)),
            "issue" => Some(Self::Issue(
                DocSearchIssueDocument::from_tantivy_document(doc, chunk)?,
            )),
            _ => None,
        }
    }
}

impl FromTantivyDocument for DocSearchWebDocument {
    /// Builds a web document: title and link are read from the attributes of
    /// `doc`, the snippet from the chunk attributes of `chunk`.
    ///
    /// Always returns `Some` when it returns at all.
    ///
    /// # Panics
    ///
    /// Panics if any of the attributes cannot be read as text.
    fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self> {
        let schema = IndexSchema::instance();

        // Reads one text attribute of the parent document as an owned string.
        let doc_text =
            |name: &str| get_json_text_field(doc, schema.field_attributes, name).to_owned();

        // Struct fields are evaluated in source order: title, link, snippet.
        Some(Self {
            title: doc_text(structured_doc::fields::web::TITLE),
            link: doc_text(structured_doc::fields::web::LINK),
            snippet: get_json_text_field(
                chunk,
                schema.field_chunk_attributes,
                structured_doc::fields::web::CHUNK_TEXT,
            )
            .to_owned(),
        })
    }
}

impl FromTantivyDocument for DocSearchIssueDocument {
    /// Builds an issue document from the attributes of `doc`; the chunk
    /// document is ignored.
    ///
    /// Always returns `Some` when it returns at all.
    ///
    /// # Panics
    ///
    /// Panics if title, link or body cannot be read as text, or if the closed
    /// flag cannot be read as a bool.
    fn from_tantivy_document(doc: &TantivyDocument, _: &TantivyDocument) -> Option<Self> {
        let attributes = IndexSchema::instance().field_attributes;

        // Reads one text attribute of the document as an owned string.
        let text = |name: &str| get_json_text_field(doc, attributes, name).to_owned();

        // Struct fields are evaluated in source order: title, link, body, closed.
        Some(Self {
            title: text(structured_doc::fields::issue::TITLE),
            link: text(structured_doc::fields::issue::LINK),
            body: text(structured_doc::fields::issue::BODY),
            closed: get_json_bool_field(doc, attributes, structured_doc::fields::issue::CLOSED),
        })
    }
}

/// Looks up the attribute `name` inside the JSON object stored in `field` of `doc`.
///
/// Takes the first value of `field`, treats it as an object and returns the
/// value of the first entry whose key equals `name`.
///
/// # Panics
///
/// Panics, naming the step that failed and the attribute, if `doc` has no
/// value for `field`, if that value is not an object, or if the object has no
/// entry named `name`.
fn get_json_field<'a>(
    doc: &'a TantivyDocument,
    field: schema::Field,
    name: &str,
) -> CompactDocValue<'a> {
    doc.get_first(field)
        .unwrap_or_else(|| {
            panic!(
                "document has no value for the JSON field expected to hold `{}`",
                name
            )
        })
        .as_object()
        .unwrap_or_else(|| {
            panic!(
                "field expected to hold `{}` does not contain a JSON object",
                name
            )
        })
        .find(|(key, _)| *key == name)
        .unwrap_or_else(|| panic!("JSON attribute `{}` not found in document", name))
        .1
}

/// Reads the attribute `name` from the JSON object stored in `field` of `doc`
/// as a `bool`.
///
/// # Panics
///
/// Panics if the attribute cannot be looked up, or, naming the attribute, if
/// its value is not a bool.
fn get_json_bool_field(doc: &TantivyDocument, field: schema::Field, name: &str) -> bool {
    get_json_field(doc, field, name)
        .as_bool()
        .unwrap_or_else(|| panic!("JSON attribute `{}` is not a bool", name))
}

/// Reads the attribute `name` from the JSON object stored in `field` of `doc`
/// as a string slice borrowed from `doc`.
///
/// # Panics
///
/// Panics if the attribute cannot be looked up, or, naming the attribute, if
/// its value is not a string.
fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str {
    get_json_field(doc, field, name)
        .as_str()
        .unwrap_or_else(|| panic!("JSON attribute `{}` is not a string", name))
}
2 changes: 1 addition & 1 deletion crates/tabby-common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ mod tests {
}

assert!(
matches!(Config::validate_model_config(&config.model.completion), Err(ref e) if true)
matches!(Config::validate_model_config(&config.model.completion), Err(ref _e) if true)
);
assert!(Config::validate_model_config(&config.model.chat).is_ok());
}
Expand Down
5 changes: 0 additions & 5 deletions crates/tabby-common/src/index/doc.rs

This file was deleted.

8 changes: 7 additions & 1 deletion crates/tabby-common/src/index/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pub mod code;
pub mod doc;
pub mod structured_doc;

use std::borrow::Cow;

Expand Down Expand Up @@ -75,6 +75,12 @@ pub const FIELD_SOURCE_ID: &str = "source_id";

/// String identifiers of the corpora.
pub mod corpus {
/// Identifier of the code corpus.
pub const CODE: &str = "code";
/// Identifier of the structured doc corpus.
pub const STRUCTURED_DOC: &str = "structured_doc";

/// Identifier of the web corpus; marked deprecated since 0.20.0 (see the
/// attribute below).
#[deprecated(
since = "0.20.0",
note = "The web corpus is deprecated and will be removed during the version upgrade."
)]
pub const WEB: &str = "web";
}

Expand Down
16 changes: 16 additions & 0 deletions crates/tabby-common/src/index/structured_doc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/// Attribute names used by structured documents.
pub mod fields {
    /// Attribute naming the kind of a structured document.
    pub const KIND: &str = "kind";

    /// Attribute names of web documents.
    pub mod web {
        /// Title of the web document.
        pub const TITLE: &str = "title";
        /// Link of the web document.
        pub const LINK: &str = "link";
        /// Text of a chunk of the web document.
        pub const CHUNK_TEXT: &str = "chunk_text";
    }

    /// Attribute names of issue documents.
    pub mod issue {
        /// Title of the issue.
        pub const TITLE: &str = "title";
        /// Link of the issue.
        pub const LINK: &str = "link";
        /// Body of the issue.
        pub const BODY: &str = "body";
        /// Closed flag of the issue.
        pub const CLOSED: &str = "closed";
    }
}
6 changes: 3 additions & 3 deletions crates/tabby-index/src/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,10 @@ impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
json!({})
}

async fn build_chunk_attributes(
async fn build_chunk_attributes<'a>(
&self,
source_code: &SourceCode,
) -> BoxStream<JoinHandle<(Vec<String>, serde_json::Value)>> {
source_code: &'a SourceCode,
) -> BoxStream<'a, JoinHandle<(Vec<String>, serde_json::Value)>> {
let text = match source_code.read_content() {
Ok(content) => content,
Err(e) => {
Expand Down
99 changes: 0 additions & 99 deletions crates/tabby-index/src/doc/mod.rs

This file was deleted.

Loading

0 comments on commit a52c4e6

Please sign in to comment.