use crate::error::DBError; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::PathBuf; use std::sync::{Arc, RwLock}; use tantivy::{ collector::TopDocs, directory::MmapDirectory, query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}, schema::{ DateOptions, Field, IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions, STORED, STRING, }, tokenizer::TokenizerManager, DateTime, Index, IndexReader, IndexWriter, TantivyDocument, Term, }; use tantivy::schema::Value; #[derive(Debug, Clone, Serialize, Deserialize)] pub enum FieldDef { Text { stored: bool, indexed: bool, tokenized: bool, fast: bool, }, Numeric { stored: bool, indexed: bool, fast: bool, precision: NumericType, }, Tag { stored: bool, separator: String, case_sensitive: bool, }, Geo { stored: bool, }, } #[derive(Debug, Clone, Serialize, Deserialize)] pub enum NumericType { I64, U64, F64, Date, } pub struct IndexSchema { schema: Schema, fields: HashMap, default_search_fields: Vec, } pub struct TantivySearch { index: Index, writer: Arc>, reader: IndexReader, index_schema: IndexSchema, name: String, config: IndexConfig, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IndexConfig { pub language: String, pub stopwords: Vec, pub stemming: bool, pub max_doc_count: Option, pub default_score: f64, } impl Default for IndexConfig { fn default() -> Self { IndexConfig { language: "english".to_string(), stopwords: vec![], stemming: true, max_doc_count: None, default_score: 1.0, } } } impl TantivySearch { pub fn new_with_schema( base_path: PathBuf, name: String, field_definitions: Vec<(String, FieldDef)>, config: Option, ) -> Result { let index_path = base_path.join(&name); std::fs::create_dir_all(&index_path) .map_err(|e| DBError(format!("Failed to create index dir: {}", e)))?; // Build schema from field definitions let mut schema_builder = Schema::builder(); let mut fields = HashMap::new(); let mut default_search_fields = Vec::new(); // Always add a document ID field let id_field = schema_builder.add_text_field("_id", STRING | STORED); fields.insert( "_id".to_string(), ( id_field, FieldDef::Text { stored: true, indexed: true, tokenized: false, fast: false, }, ), ); // Add user-defined fields for (field_name, field_def) in field_definitions { let field = match &field_def { FieldDef::Text { stored, indexed, tokenized, fast: _fast, } => { let mut text_options = TextOptions::default(); if *stored { text_options = text_options.set_stored(); } if *indexed { let indexing_options = if *tokenized { TextFieldIndexing::default() .set_tokenizer("default") .set_index_option(IndexRecordOption::WithFreqsAndPositions) } else { TextFieldIndexing::default() .set_tokenizer("raw") .set_index_option(IndexRecordOption::Basic) }; text_options = text_options.set_indexing_options(indexing_options); let f = schema_builder.add_text_field(&field_name, text_options); if *tokenized { default_search_fields.push(f); } f } else { schema_builder.add_text_field(&field_name, text_options) } } FieldDef::Numeric { stored, indexed, fast, precision, } => match precision { NumericType::I64 => { let mut opts = NumericOptions::default(); if *stored { opts = opts.set_stored(); } if *indexed { opts = opts.set_indexed(); } if *fast { opts = opts.set_fast(); } schema_builder.add_i64_field(&field_name, opts) } NumericType::U64 => { let mut opts = NumericOptions::default(); if *stored { opts = opts.set_stored(); } if *indexed { opts = opts.set_indexed(); } if *fast { opts = opts.set_fast(); } schema_builder.add_u64_field(&field_name, opts) } NumericType::F64 => { let mut opts = NumericOptions::default(); if *stored { opts = opts.set_stored(); } if *indexed { opts = opts.set_indexed(); } if *fast { opts = opts.set_fast(); } schema_builder.add_f64_field(&field_name, opts) } NumericType::Date => { let mut opts = DateOptions::default(); if *stored { opts = opts.set_stored(); } if *indexed { opts = opts.set_indexed(); } if *fast { opts = opts.set_fast(); } schema_builder.add_date_field(&field_name, opts) } }, FieldDef::Tag { stored, separator: _, case_sensitive: _, } => { let mut text_options = TextOptions::default(); if *stored { text_options = text_options.set_stored(); } text_options = text_options.set_indexing_options( TextFieldIndexing::default() .set_tokenizer("raw") .set_index_option(IndexRecordOption::Basic), ); schema_builder.add_text_field(&field_name, text_options) } FieldDef::Geo { stored } => { // For now, store as two f64 fields for lat/lon let mut opts = NumericOptions::default(); if *stored { opts = opts.set_stored(); } opts = opts.set_indexed().set_fast(); let lat_field = schema_builder.add_f64_field(&format!("{}_lat", field_name), opts.clone()); let lon_field = schema_builder.add_f64_field(&format!("{}_lon", field_name), opts); fields.insert( format!("{}_lat", field_name), ( lat_field, FieldDef::Numeric { stored: *stored, indexed: true, fast: true, precision: NumericType::F64, }, ), ); fields.insert( format!("{}_lon", field_name), ( lon_field, FieldDef::Numeric { stored: *stored, indexed: true, fast: true, precision: NumericType::F64, }, ), ); continue; // Skip adding the geo field itself } }; fields.insert(field_name.clone(), (field, field_def)); } let schema = schema_builder.build(); let index_schema = IndexSchema { schema: schema.clone(), fields, default_search_fields, }; // Create or open index let dir = MmapDirectory::open(&index_path) .map_err(|e| DBError(format!("Failed to open index directory: {}", e)))?; let mut index = Index::open_or_create(dir, schema).map_err(|e| DBError(format!("Failed to create index: {}", e)))?; // Configure tokenizers let tokenizer_manager = TokenizerManager::default(); index.set_tokenizers(tokenizer_manager); let writer = index .writer(15_000_000) .map_err(|e| DBError(format!("Failed to create index writer: {}", e)))?; let reader = index .reader() .map_err(|e| DBError(format!("Failed to create reader: {}", e)))?; let config = config.unwrap_or_default(); Ok(TantivySearch { index, writer: Arc::new(RwLock::new(writer)), reader, index_schema, name, config, }) } pub fn add_document_with_fields( &self, doc_id: &str, fields: HashMap, ) -> Result<(), DBError> { let mut writer = self .writer .write() .map_err(|e| DBError(format!("Failed to acquire writer lock: {}", e)))?; // Delete existing document with same ID if let Some((id_field, _)) = self.index_schema.fields.get("_id") { writer.delete_term(Term::from_field_text(*id_field, doc_id)); } // Create new document let mut doc = tantivy::doc!(); // Add document ID if let Some((id_field, _)) = self.index_schema.fields.get("_id") { doc.add_text(*id_field, doc_id); } // Add other fields based on schema for (field_name, field_value) in fields { if let Some((field, field_def)) = self.index_schema.fields.get(&field_name) { match field_def { FieldDef::Text { .. } => { doc.add_text(*field, &field_value); } FieldDef::Numeric { precision, .. } => match precision { NumericType::I64 => { if let Ok(v) = field_value.parse::() { doc.add_i64(*field, v); } } NumericType::U64 => { if let Ok(v) = field_value.parse::() { doc.add_u64(*field, v); } } NumericType::F64 => { if let Ok(v) = field_value.parse::() { doc.add_f64(*field, v); } } NumericType::Date => { if let Ok(v) = field_value.parse::() { doc.add_date(*field, DateTime::from_timestamp_millis(v)); } } }, FieldDef::Tag { separator, case_sensitive, .. } => { let tags = if !case_sensitive { field_value.to_lowercase() } else { field_value.clone() }; for tag in tags.split(separator.as_str()) { doc.add_text(*field, tag.trim()); } } FieldDef::Geo { .. } => { let parts: Vec<&str> = field_value.split(',').collect(); if parts.len() == 2 { if let (Ok(lat), Ok(lon)) = (parts[0].parse::(), parts[1].parse::()) { if let Some((lat_field, _)) = self.index_schema.fields.get(&format!("{}_lat", field_name)) { doc.add_f64(*lat_field, lat); } if let Some((lon_field, _)) = self.index_schema.fields.get(&format!("{}_lon", field_name)) { doc.add_f64(*lon_field, lon); } } } } } } } writer .add_document(doc) .map_err(|e| DBError(format!("Failed to add document: {}", e)))?; writer .commit() .map_err(|e| DBError(format!("Failed to commit: {}", e)))?; // Make new documents visible to searches self.reader .reload() .map_err(|e| DBError(format!("Failed to reload reader: {}", e)))?; Ok(()) } pub fn search_with_options( &self, query_str: &str, options: SearchOptions, ) -> Result { // Ensure reader is up to date with latest commits self.reader .reload() .map_err(|e| DBError(format!("Failed to reload reader: {}", e)))?; let searcher = self.reader.searcher(); // Ensure we have searchable fields if self.index_schema.default_search_fields.is_empty() { return Err(DBError("No searchable fields defined in schema".to_string())); } // Parse query based on search fields let query_parser = QueryParser::for_index( &self.index, self.index_schema.default_search_fields.clone(), ); let parsed_query = query_parser .parse_query(query_str) .map_err(|e| DBError(format!("Failed to parse query: {}", e)))?; let mut clauses: Vec<(Occur, Box)> = vec![(Occur::Must, parsed_query)]; // Apply filters if any for filter in options.filters { if let Some((field, field_def)) = self.index_schema.fields.get(&filter.field) { match filter.filter_type { FilterType::Equals(value) => { match field_def { FieldDef::Text { .. } | FieldDef::Tag { .. } => { let term_query = TermQuery::new(Term::from_field_text(*field, &value), IndexRecordOption::Basic); clauses.push((Occur::Must, Box::new(term_query))); } FieldDef::Numeric { precision, .. } => { // Equals on numeric fields: parse to the right numeric type and use term query match precision { NumericType::I64 => { if let Ok(v) = value.parse::() { let term = Term::from_field_i64(*field, v); let tq = TermQuery::new(term, IndexRecordOption::Basic); clauses.push((Occur::Must, Box::new(tq))); } } NumericType::U64 => { if let Ok(v) = value.parse::() { let term = Term::from_field_u64(*field, v); let tq = TermQuery::new(term, IndexRecordOption::Basic); clauses.push((Occur::Must, Box::new(tq))); } } NumericType::F64 => { if let Ok(v) = value.parse::() { let term = Term::from_field_f64(*field, v); let tq = TermQuery::new(term, IndexRecordOption::Basic); clauses.push((Occur::Must, Box::new(tq))); } } NumericType::Date => { if let Ok(v) = value.parse::() { let dt = DateTime::from_timestamp_millis(v); let term = Term::from_field_date(*field, dt); let tq = TermQuery::new(term, IndexRecordOption::Basic); clauses.push((Occur::Must, Box::new(tq))); } } } } FieldDef::Geo { .. } => { // Geo equals isn't supported in this simplified version } } } FilterType::Range { .. } => { // TODO: Implement numeric range queries by building a RangeQuery per type } FilterType::InSet(values) => { // OR across values let mut sub_clauses: Vec<(Occur, Box)> = vec![]; for value in values { let term_query = TermQuery::new( Term::from_field_text(*field, &value), IndexRecordOption::Basic, ); sub_clauses.push((Occur::Should, Box::new(term_query))); } clauses.push((Occur::Must, Box::new(BooleanQuery::new(sub_clauses)))); } } } } let final_query: Box = if clauses.len() == 1 { clauses.pop().unwrap().1 } else { Box::new(BooleanQuery::new(clauses)) }; // Execute search let top_docs = searcher .search(&*final_query, &TopDocs::with_limit(options.limit + options.offset)) .map_err(|e| DBError(format!("Search failed: {}", e)))?; let total_hits = top_docs.len(); let mut documents = Vec::new(); for (score, doc_address) in top_docs.into_iter().skip(options.offset).take(options.limit) { let retrieved_doc: TantivyDocument = searcher .doc(doc_address) .map_err(|e| DBError(format!("Failed to retrieve doc: {}", e)))?; let mut doc_fields = HashMap::new(); // Extract stored fields (or synthesize) for (field_name, (field, field_def)) in &self.index_schema.fields { match field_def { FieldDef::Text { stored, .. } | FieldDef::Tag { stored, .. } => { if *stored { if let Some(value) = retrieved_doc.get_first(*field) { if let Some(text) = value.as_str() { doc_fields.insert(field_name.clone(), text.to_string()); } } } } FieldDef::Numeric { stored, precision, .. } => { if *stored { let value_str = match precision { NumericType::I64 => retrieved_doc .get_first(*field) .and_then(|v| v.as_i64()) .map(|v| v.to_string()), NumericType::U64 => retrieved_doc .get_first(*field) .and_then(|v| v.as_u64()) .map(|v| v.to_string()), NumericType::F64 => retrieved_doc .get_first(*field) .and_then(|v| v.as_f64()) .map(|v| v.to_string()), NumericType::Date => retrieved_doc .get_first(*field) .and_then(|v| v.as_datetime()) .map(|v| v.into_timestamp_millis().to_string()), }; if let Some(v) = value_str { doc_fields.insert(field_name.clone(), v); } } } FieldDef::Geo { stored } => { if *stored { let lat_field = self .index_schema .fields .get(&format!("{}_lat", field_name)) .unwrap() .0; let lon_field = self .index_schema .fields .get(&format!("{}_lon", field_name)) .unwrap() .0; let lat = retrieved_doc.get_first(lat_field).and_then(|v| v.as_f64()); let lon = retrieved_doc.get_first(lon_field).and_then(|v| v.as_f64()); if let (Some(lat), Some(lon)) = (lat, lon) { doc_fields.insert(field_name.clone(), format!("{},{}", lat, lon)); } } } } } documents.push(SearchDocument { fields: doc_fields, score, }); } Ok(SearchResults { total: total_hits, documents, }) } pub fn get_info(&self) -> Result { let searcher = self.reader.searcher(); let num_docs = searcher.num_docs(); let fields_info: Vec = self .index_schema .fields .iter() .map(|(name, (_, def))| FieldInfo { name: name.clone(), field_type: format!("{:?}", def), }) .collect(); Ok(IndexInfo { name: self.name.clone(), num_docs, fields: fields_info, config: self.config.clone(), }) } /// Delete a document by its _id term. Returns true if the document existed before deletion. pub fn delete_document_by_id(&self, doc_id: &str) -> Result { // Determine existence by running a tiny term query let existed = if let Some((id_field, _)) = self.index_schema.fields.get("_id") { let term = Term::from_field_text(*id_field, doc_id); let searcher = self.reader.searcher(); let tq = TermQuery::new(term.clone(), IndexRecordOption::Basic); let hits = searcher .search(&tq, &TopDocs::with_limit(1)) .map_err(|e| DBError(format!("Failed to search for existing doc: {}", e)))?; !hits.is_empty() } else { false }; // Perform deletion and commit let mut writer = self .writer .write() .map_err(|e| DBError(format!("Failed to acquire writer lock: {}", e)))?; if let Some((id_field, _)) = self.index_schema.fields.get("_id") { writer.delete_term(Term::from_field_text(*id_field, doc_id)); } writer .commit() .map_err(|e| DBError(format!("Failed to commit delete: {}", e)))?; // Refresh reader to observe deletion self.reader .reload() .map_err(|e| DBError(format!("Failed to reload reader: {}", e)))?; Ok(existed) } } #[derive(Debug, Clone)] pub struct SearchOptions { pub limit: usize, pub offset: usize, pub filters: Vec, pub sort_by: Option, pub return_fields: Option>, pub highlight: bool, } impl Default for SearchOptions { fn default() -> Self { SearchOptions { limit: 10, offset: 0, filters: vec![], sort_by: None, return_fields: None, highlight: false, } } } #[derive(Debug, Clone)] pub struct Filter { pub field: String, pub filter_type: FilterType, } #[derive(Debug, Clone)] pub enum FilterType { Equals(String), Range { min: String, max: String }, InSet(Vec), } #[derive(Debug)] pub struct SearchResults { pub total: usize, pub documents: Vec, } #[derive(Debug)] pub struct SearchDocument { pub fields: HashMap, pub score: f32, } #[derive(Debug, Serialize, Deserialize)] pub struct IndexInfo { pub name: String, pub num_docs: u64, pub fields: Vec, pub config: IndexConfig, } #[derive(Debug, Serialize, Deserialize)] pub struct FieldInfo { pub name: String, pub field_type: String, }