use tantivy::{ collector::TopDocs, directory::MmapDirectory, query::{QueryParser, BooleanQuery, Query, TermQuery, Occur}, schema::{Schema, Field, TextOptions, TextFieldIndexing, STORED, STRING, Value}, Index, IndexWriter, IndexReader, ReloadPolicy, Term, DateTime, TantivyDocument, tokenizer::{TokenizerManager}, }; use std::path::PathBuf; use std::sync::{Arc, RwLock}; use std::collections::HashMap; use crate::error::DBError; use serde::{Serialize, Deserialize}; #[derive(Debug, Clone, Serialize, Deserialize)] pub enum FieldDef { Text { stored: bool, indexed: bool, tokenized: bool, fast: bool, }, Numeric { stored: bool, indexed: bool, fast: bool, precision: NumericType, }, Tag { stored: bool, separator: String, case_sensitive: bool, }, Geo { stored: bool, }, } #[derive(Debug, Clone, Serialize, Deserialize)] pub enum NumericType { I64, U64, F64, Date, } pub struct IndexSchema { schema: Schema, fields: HashMap, default_search_fields: Vec, } pub struct TantivySearch { index: Index, writer: Arc>, reader: IndexReader, index_schema: IndexSchema, name: String, config: IndexConfig, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IndexConfig { pub language: String, pub stopwords: Vec, pub stemming: bool, pub max_doc_count: Option, pub default_score: f64, } impl Default for IndexConfig { fn default() -> Self { IndexConfig { language: "english".to_string(), stopwords: vec![], stemming: true, max_doc_count: None, default_score: 1.0, } } } impl TantivySearch { pub fn new_with_schema( base_path: PathBuf, name: String, field_definitions: Vec<(String, FieldDef)>, config: Option, ) -> Result { let index_path = base_path.join(&name); std::fs::create_dir_all(&index_path) .map_err(|e| DBError(format!("Failed to create index dir: {}", e)))?; // Build schema from field definitions let mut schema_builder = Schema::builder(); let mut fields = HashMap::new(); let mut default_search_fields = Vec::new(); // Always add a document ID field let id_field = schema_builder.add_text_field("_id", STRING | STORED); fields.insert("_id".to_string(), (id_field, FieldDef::Text { stored: true, indexed: true, tokenized: false, fast: false, })); // Add user-defined fields for (field_name, field_def) in field_definitions { let field = match &field_def { FieldDef::Text { stored, indexed, tokenized, fast: _fast } => { let mut text_options = TextOptions::default(); if *stored { text_options = text_options.set_stored(); } if *indexed { let indexing_options = if *tokenized { TextFieldIndexing::default() .set_tokenizer("default") .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions) } else { TextFieldIndexing::default() .set_tokenizer("raw") .set_index_option(tantivy::schema::IndexRecordOption::Basic) }; text_options = text_options.set_indexing_options(indexing_options); let f = schema_builder.add_text_field(&field_name, text_options); if *tokenized { default_search_fields.push(f); } f } else { schema_builder.add_text_field(&field_name, text_options) } } FieldDef::Numeric { stored, indexed, fast, precision } => { match precision { NumericType::I64 => { let mut opts = tantivy::schema::NumericOptions::default(); if *stored { opts = opts.set_stored(); } if *indexed { opts = opts.set_indexed(); } if *fast { opts = opts.set_fast(); } schema_builder.add_i64_field(&field_name, opts) } NumericType::U64 => { let mut opts = tantivy::schema::NumericOptions::default(); if *stored { opts = opts.set_stored(); } if *indexed { opts = opts.set_indexed(); } if *fast { opts = opts.set_fast(); } schema_builder.add_u64_field(&field_name, opts) } NumericType::F64 => { let mut opts = tantivy::schema::NumericOptions::default(); if *stored { opts = opts.set_stored(); } if *indexed { opts = opts.set_indexed(); } if *fast { opts = opts.set_fast(); } schema_builder.add_f64_field(&field_name, opts) } NumericType::Date => { let mut opts = tantivy::schema::DateOptions::default(); if *stored { opts = opts.set_stored(); } if *indexed { opts = opts.set_indexed(); } if *fast { opts = opts.set_fast(); } schema_builder.add_date_field(&field_name, opts) } } } FieldDef::Tag { stored, separator: _, case_sensitive: _ } => { let mut text_options = TextOptions::default(); if *stored { text_options = text_options.set_stored(); } text_options = text_options.set_indexing_options( TextFieldIndexing::default() .set_tokenizer("raw") .set_index_option(tantivy::schema::IndexRecordOption::Basic) ); schema_builder.add_text_field(&field_name, text_options) } FieldDef::Geo { stored } => { // For now, store as two f64 fields for lat/lon let mut opts = tantivy::schema::NumericOptions::default(); if *stored { opts = opts.set_stored(); } opts = opts.set_indexed().set_fast(); let lat_field = schema_builder.add_f64_field(&format!("{}_lat", field_name), opts.clone()); let lon_field = schema_builder.add_f64_field(&format!("{}_lon", field_name), opts); fields.insert(format!("{}_lat", field_name), (lat_field, FieldDef::Numeric { stored: *stored, indexed: true, fast: true, precision: NumericType::F64, })); fields.insert(format!("{}_lon", field_name), (lon_field, FieldDef::Numeric { stored: *stored, indexed: true, fast: true, precision: NumericType::F64, })); continue; // Skip adding the geo field itself } }; fields.insert(field_name.clone(), (field, field_def)); } let schema = schema_builder.build(); let index_schema = IndexSchema { schema: schema.clone(), fields, default_search_fields, }; // Create or open index let dir = MmapDirectory::open(&index_path) .map_err(|e| DBError(format!("Failed to open index directory: {}", e)))?; let mut index = Index::open_or_create(dir, schema) .map_err(|e| DBError(format!("Failed to create index: {}", e)))?; // Configure tokenizers let tokenizer_manager = TokenizerManager::default(); index.set_tokenizers(tokenizer_manager); let writer = index.writer(1_000_000) .map_err(|e| DBError(format!("Failed to create index writer: {}", e)))?; let reader = index .reader_builder() .reload_policy(ReloadPolicy::OnCommitWithDelay) .try_into() .map_err(|e| DBError(format!("Failed to create reader: {}", e)))?; let config = config.unwrap_or_default(); Ok(TantivySearch { index, writer: Arc::new(RwLock::new(writer)), reader, index_schema, name, config, }) } pub fn add_document_with_fields( &self, doc_id: &str, fields: HashMap, ) -> Result<(), DBError> { let mut writer = self.writer.write() .map_err(|e| DBError(format!("Failed to acquire writer lock: {}", e)))?; // Delete existing document with same ID if let Some((id_field, _)) = self.index_schema.fields.get("_id") { writer.delete_term(Term::from_field_text(*id_field, doc_id)); } // Create new document let mut doc = tantivy::doc!(); // Add document ID if let Some((id_field, _)) = self.index_schema.fields.get("_id") { doc.add_text(*id_field, doc_id); } // Add other fields based on schema for (field_name, field_value) in fields { if let Some((field, field_def)) = self.index_schema.fields.get(&field_name) { match field_def { FieldDef::Text { .. } => { doc.add_text(*field, &field_value); } FieldDef::Numeric { precision, .. } => { match precision { NumericType::I64 => { if let Ok(v) = field_value.parse::() { doc.add_i64(*field, v); } } NumericType::U64 => { if let Ok(v) = field_value.parse::() { doc.add_u64(*field, v); } } NumericType::F64 => { if let Ok(v) = field_value.parse::() { doc.add_f64(*field, v); } } NumericType::Date => { if let Ok(v) = field_value.parse::() { doc.add_date(*field, DateTime::from_timestamp_millis(v)); } } } } FieldDef::Tag { separator, case_sensitive, .. } => { let tags = if !case_sensitive { field_value.to_lowercase() } else { field_value.clone() }; // Store tags as separate terms for efficient filtering for tag in tags.split(separator.as_str()) { doc.add_text(*field, tag.trim()); } } FieldDef::Geo { .. } => { // Parse "lat,lon" format let parts: Vec<&str> = field_value.split(',').collect(); if parts.len() == 2 { if let (Ok(lat), Ok(lon)) = (parts[0].parse::(), parts[1].parse::()) { if let Some((lat_field, _)) = self.index_schema.fields.get(&format!("{}_lat", field_name)) { doc.add_f64(*lat_field, lat); } if let Some((lon_field, _)) = self.index_schema.fields.get(&format!("{}_lon", field_name)) { doc.add_f64(*lon_field, lon); } } } } } } } writer.add_document(doc).map_err(|e| DBError(format!("Failed to add document: {}", e)))?; writer.commit() .map_err(|e| DBError(format!("Failed to commit: {}", e)))?; Ok(()) } pub fn search_with_options( &self, query_str: &str, options: SearchOptions, ) -> Result { let searcher = self.reader.searcher(); // Parse query based on search fields let query: Box = if self.index_schema.default_search_fields.is_empty() { return Err(DBError("No searchable fields defined in schema".to_string())); } else { let query_parser = QueryParser::for_index( &self.index, self.index_schema.default_search_fields.clone(), ); Box::new(query_parser.parse_query(query_str) .map_err(|e| DBError(format!("Failed to parse query: {}", e)))?) }; // Apply filters if any let final_query = if !options.filters.is_empty() { let mut clauses: Vec<(Occur, Box)> = vec![(Occur::Must, query)]; // Add filters for filter in options.filters { if let Some((field, _)) = self.index_schema.fields.get(&filter.field) { match filter.filter_type { FilterType::Equals(value) => { let term_query = TermQuery::new( Term::from_field_text(*field, &value), tantivy::schema::IndexRecordOption::Basic, ); clauses.push((Occur::Must, Box::new(term_query))); } FilterType::Range { min: _, max: _ } => { // Would need numeric field handling here // Simplified for now } FilterType::InSet(values) => { let mut sub_clauses: Vec<(Occur, Box)> = vec![]; for value in values { let term_query = TermQuery::new( Term::from_field_text(*field, &value), tantivy::schema::IndexRecordOption::Basic, ); sub_clauses.push((Occur::Should, Box::new(term_query))); } clauses.push((Occur::Must, Box::new(BooleanQuery::new(sub_clauses)))); } } } } Box::new(BooleanQuery::new(clauses)) } else { query }; // Execute search let top_docs = searcher.search( &*final_query, &TopDocs::with_limit(options.limit + options.offset) ).map_err(|e| DBError(format!("Search failed: {}", e)))?; let total_hits = top_docs.len(); let mut documents = Vec::new(); for (score, doc_address) in top_docs.iter().skip(options.offset).take(options.limit) { let retrieved_doc: TantivyDocument = searcher.doc(*doc_address) .map_err(|e| DBError(format!("Failed to retrieve doc: {}", e)))?; let mut doc_fields = HashMap::new(); // Extract all stored fields for (field_name, (field, field_def)) in &self.index_schema.fields { match field_def { FieldDef::Text { stored, .. } | FieldDef::Tag { stored, .. } => { if *stored { if let Some(value) = retrieved_doc.get_first(*field) { if let Some(text) = value.as_str() { doc_fields.insert(field_name.clone(), text.to_string()); } } } } FieldDef::Numeric { stored, precision, .. } => { if *stored { let value_str = match precision { NumericType::I64 => { retrieved_doc.get_first(*field) .and_then(|v| v.as_i64()) .map(|v| v.to_string()) } NumericType::U64 => { retrieved_doc.get_first(*field) .and_then(|v| v.as_u64()) .map(|v| v.to_string()) } NumericType::F64 => { retrieved_doc.get_first(*field) .and_then(|v| v.as_f64()) .map(|v| v.to_string()) } NumericType::Date => { retrieved_doc.get_first(*field) .and_then(|v| v.as_datetime()) .map(|v| v.into_timestamp_millis().to_string()) } }; if let Some(v) = value_str { doc_fields.insert(field_name.clone(), v); } } } FieldDef::Geo { stored } => { if *stored { let lat_field = self.index_schema.fields.get(&format!("{}_lat", field_name)).unwrap().0; let lon_field = self.index_schema.fields.get(&format!("{}_lon", field_name)).unwrap().0; let lat = retrieved_doc.get_first(lat_field).and_then(|v| v.as_f64()); let lon = retrieved_doc.get_first(lon_field).and_then(|v| v.as_f64()); if let (Some(lat), Some(lon)) = (lat, lon) { doc_fields.insert(field_name.clone(), format!("{},{}", lat, lon)); } } } } } documents.push(SearchDocument { fields: doc_fields, score: *score, }); } Ok(SearchResults { total: total_hits, documents, }) } pub fn get_info(&self) -> Result { let searcher = self.reader.searcher(); let num_docs = searcher.num_docs(); let fields_info: Vec = self.index_schema.fields.iter().map(|(name, (_, def))| { FieldInfo { name: name.clone(), field_type: format!("{:?}", def), } }).collect(); Ok(IndexInfo { name: self.name.clone(), num_docs, fields: fields_info, config: self.config.clone(), }) } } #[derive(Debug, Clone)] pub struct SearchOptions { pub limit: usize, pub offset: usize, pub filters: Vec, pub sort_by: Option, pub return_fields: Option>, pub highlight: bool, } impl Default for SearchOptions { fn default() -> Self { SearchOptions { limit: 10, offset: 0, filters: vec![], sort_by: None, return_fields: None, highlight: false, } } } #[derive(Debug, Clone)] pub struct Filter { pub field: String, pub filter_type: FilterType, } #[derive(Debug, Clone)] pub enum FilterType { Equals(String), Range { min: String, max: String }, InSet(Vec), } #[derive(Debug)] pub struct SearchResults { pub total: usize, pub documents: Vec, } #[derive(Debug)] pub struct SearchDocument { pub fields: HashMap, pub score: f32, } #[derive(Debug, Serialize, Deserialize)] pub struct IndexInfo { pub name: String, pub num_docs: u64, pub fields: Vec, pub config: IndexConfig, } #[derive(Debug, Serialize, Deserialize)] pub struct FieldInfo { pub name: String, pub field_type: String, }