diff --git a/herodb/docs/cmds.md b/herodb/docs/cmds.md index fa85ff4..78a6e78 100644 --- a/herodb/docs/cmds.md +++ b/herodb/docs/cmds.md @@ -70,6 +70,15 @@ MULTI/EXEC/DISCARD | ✅ | ❌ | Only supported in redb | **Encryption** | | | | Data-at-rest encryption | ✅ | ✅ | Both support [age](age.tech) encryption | AGE commands | ✅ | ✅ | Both support AGE crypto commands | +**Full-Text Search** | | | | +FT.CREATE | ✅ | ✅ | Create search index with schema | +FT.ADD | ✅ | ✅ | Add document to search index | +FT.SEARCH | ✅ | ✅ | Search documents with query | +FT.DEL | ✅ | ✅ | Delete document from index | +FT.INFO | ✅ | ✅ | Get index information | +FT.DROP | ✅ | ✅ | Drop search index | +FT.ALTER | ✅ | ✅ | Alter index schema | +FT.AGGREGATE | ✅ | ✅ | Aggregate search results | ### Performance Considerations diff --git a/herodb/docs/search.md b/herodb/docs/search.md new file mode 100644 index 0000000..27cdfb3 --- /dev/null +++ b/herodb/docs/search.md @@ -0,0 +1,397 @@ +# Full-Text Search with Tantivy + +HeroDB includes powerful full-text search capabilities powered by [Tantivy](https://github.com/quickwit-oss/tantivy), a fast full-text search engine library written in Rust. This provides Redis-compatible search commands similar to RediSearch. + +## Overview + +The search functionality allows you to: +- Create search indexes with custom schemas +- Index documents with multiple field types +- Perform complex queries with filters +- Work with text, numeric, date, and geographic data +- Run real-time searches with high performance + +## Search Commands + +### FT.CREATE - Create Search Index + +Create a new search index with a defined schema. + +```bash +FT.CREATE index_name SCHEMA field_name field_type [options] [field_name field_type [options] ...] 
+``` + +**Field Types:** +- `TEXT` - Full-text searchable text fields +- `NUMERIC` - Numeric fields (integers, floats) +- `TAG` - Tag fields for exact matching +- `GEO` - Geographic coordinates (lat,lon) +- `DATE` - Date/timestamp fields + +**Field Options:** +- `STORED` - Store field value for retrieval +- `INDEXED` - Make field searchable +- `TOKENIZED` - Enable tokenization for text fields +- `FAST` - Enable fast access for numeric fields + +**Example:** +```bash +# Create a product search index +FT.CREATE products SCHEMA + title TEXT STORED INDEXED TOKENIZED + description TEXT STORED INDEXED TOKENIZED + price NUMERIC STORED INDEXED FAST + category TAG STORED + location GEO STORED + created_date DATE STORED INDEXED +``` + +### FT.ADD - Add Document to Index + +Add a document to a search index. + +```bash +FT.ADD index_name doc_id [SCORE score] FIELDS field_name field_value [field_name field_value ...] +``` + +**Example:** +```bash +# Add a product document +FT.ADD products product:1 SCORE 1.0 FIELDS + title "Wireless Headphones" + description "High-quality wireless headphones with noise cancellation" + price 199.99 + category "electronics" + location "37.7749,-122.4194" + created_date 1640995200000 +``` + +### FT.SEARCH - Search Documents + +Search for documents in an index. 
+ +```bash +FT.SEARCH index_name query [LIMIT offset count] [FILTER field min max] [RETURN field [field ...]] +``` + +**Query Syntax:** +- Simple terms: `wireless headphones` +- Phrase queries: `"noise cancellation"` +- Field-specific: `title:wireless` +- Boolean operators: `wireless AND headphones` +- Wildcards: `head*` + +**Examples:** +```bash +# Simple text search +FT.SEARCH products "wireless headphones" + +# Search with filters +FT.SEARCH products "headphones" FILTER price 100 300 LIMIT 0 10 + +# Field-specific search +FT.SEARCH products "title:wireless AND category:electronics" + +# Return specific fields only +FT.SEARCH products "*" RETURN title price +``` + +### FT.DEL - Delete Document + +Remove a document from the search index. + +```bash +FT.DEL index_name doc_id +``` + +**Example:** +```bash +FT.DEL products product:1 +``` + +### FT.INFO - Get Index Information + +Get information about a search index. + +```bash +FT.INFO index_name +``` + +**Returns:** +- Index name and document count +- Field definitions and types +- Index configuration + +**Example:** +```bash +FT.INFO products +``` + +### FT.DROP - Drop Index + +Delete an entire search index. + +```bash +FT.DROP index_name +``` + +**Example:** +```bash +FT.DROP products +``` + +### FT.ALTER - Alter Index Schema + +Add new fields to an existing index. + +```bash +FT.ALTER index_name SCHEMA ADD field_name field_type [options] +``` + +**Example:** +```bash +FT.ALTER products SCHEMA ADD brand TAG STORED +``` + +### FT.AGGREGATE - Aggregate Search Results + +Perform aggregations on search results. 
+ +```bash +FT.AGGREGATE index_name query [GROUPBY field] [REDUCE function field AS alias] +``` + +**Example:** +```bash +# Group products by category and count +FT.AGGREGATE products "*" GROUPBY category REDUCE COUNT 0 AS count +``` + +## Field Types in Detail + +### TEXT Fields +- **Purpose**: Full-text search on natural language content +- **Features**: Tokenization, stemming, stop-word removal +- **Options**: `STORED`, `INDEXED`, `TOKENIZED` +- **Example**: Product titles, descriptions, content + +### NUMERIC Fields +- **Purpose**: Numeric data for range queries and sorting +- **Types**: I64, U64, F64 +- **Options**: `STORED`, `INDEXED`, `FAST` +- **Example**: Prices, quantities, ratings + +### TAG Fields +- **Purpose**: Exact-match categorical data +- **Features**: No tokenization, exact string matching +- **Options**: `STORED`, case sensitivity control +- **Example**: Categories, brands, status values + +### GEO Fields +- **Purpose**: Geographic coordinates +- **Format**: "latitude,longitude" (e.g., "37.7749,-122.4194") +- **Features**: Coordinate storage and retrieval; geographic distance queries are planned (see Future Enhancements) +- **Options**: `STORED` + +### DATE Fields +- **Purpose**: Timestamp and date data +- **Format**: Unix timestamp in milliseconds +- **Features**: Range queries, temporal filtering +- **Options**: `STORED`, `INDEXED`, `FAST` + +## Search Query Syntax + +### Basic Queries +```bash +# Single term +FT.SEARCH products "wireless" + +# Multiple terms (AND by default) +FT.SEARCH products "wireless headphones" + +# Phrase query +FT.SEARCH products "\"noise cancellation\"" +``` + +### Field-Specific Queries +```bash +# Search in specific field +FT.SEARCH products "title:wireless" + +# Multiple field queries +FT.SEARCH products "title:wireless AND description:bluetooth" +``` + +### Boolean Operators +```bash +# AND operator +FT.SEARCH products "wireless AND headphones" + +# OR operator +FT.SEARCH products "wireless OR bluetooth" + +# NOT operator +FT.SEARCH products "headphones NOT wired" +``` + +### 
Wildcards and Fuzzy Search +```bash +# Wildcard search +FT.SEARCH products "head*" + +# Fuzzy search (approximate matching) +FT.SEARCH products "%headphone%" +``` + +### Range Queries +```bash +# Numeric range in query +FT.SEARCH products "@price:[100 300]" + +# Date range +FT.SEARCH products "@created_date:[1640995200000 1672531200000]" +``` + +## Filtering and Sorting + +### FILTER Clause +```bash +# Numeric filter +FT.SEARCH products "headphones" FILTER price 100 300 + +# Multiple filters +FT.SEARCH products "*" FILTER price 100 500 FILTER rating 4 5 +``` + +### LIMIT Clause +```bash +# Pagination +FT.SEARCH products "wireless" LIMIT 0 10 # First 10 results +FT.SEARCH products "wireless" LIMIT 10 10 # Next 10 results +``` + +### RETURN Clause +```bash +# Return specific fields +FT.SEARCH products "*" RETURN title price + +# Return all stored fields (default) +FT.SEARCH products "*" +``` + +## Performance Considerations + +### Indexing Strategy +- Only index fields you need to search on +- Use `FAST` option for frequently filtered numeric fields +- Consider storage vs. 
search performance trade-offs + +### Query Optimization +- Use specific field queries when possible +- Combine filters with text queries for better performance +- Use pagination with LIMIT for large result sets + +### Memory Usage +- Tantivy indexes are memory-mapped for performance +- Index size depends on document count and field configuration +- Monitor disk space for index storage + +## Integration with Redis Commands + +Search indexes work alongside regular Redis data: + +```bash +# Store product data in Redis hash +HSET product:1 title "Wireless Headphones" price "199.99" + +# Index the same data for search +FT.ADD products product:1 FIELDS title "Wireless Headphones" price 199.99 + +# Search returns document IDs that can be used with Redis commands +FT.SEARCH products "wireless" +# Returns: product:1 + +# Retrieve full data using Redis +HGETALL product:1 +``` + +## Example Use Cases + +### E-commerce Product Search +```bash +# Create product catalog index +FT.CREATE catalog SCHEMA + name TEXT STORED INDEXED TOKENIZED + description TEXT INDEXED TOKENIZED + price NUMERIC STORED INDEXED FAST + category TAG STORED + brand TAG STORED + rating NUMERIC STORED FAST + +# Add products +FT.ADD catalog prod:1 FIELDS name "iPhone 14" price 999 category "phones" brand "apple" rating 4.5 +FT.ADD catalog prod:2 FIELDS name "Samsung Galaxy" price 899 category "phones" brand "samsung" rating 4.3 + +# Search queries +FT.SEARCH catalog "iPhone" +FT.SEARCH catalog "phones" FILTER price 800 1000 +FT.SEARCH catalog "brand:apple" +``` + +### Content Management +```bash +# Create content index +FT.CREATE content SCHEMA + title TEXT STORED INDEXED TOKENIZED + body TEXT INDEXED TOKENIZED + author TAG STORED + published DATE STORED INDEXED + tags TAG STORED + +# Search content +FT.SEARCH content "machine learning" +FT.SEARCH content "author:john AND tags:ai" +FT.SEARCH content "*" FILTER published 1640995200000 1672531200000 +``` + +### Geographic Search +```bash +# Create 
location-based index +FT.CREATE places SCHEMA + name TEXT STORED INDEXED TOKENIZED + location GEO STORED + type TAG STORED + +# Add locations +FT.ADD places place:1 FIELDS name "Golden Gate Bridge" location "37.8199,-122.4783" type "landmark" + +# Geographic queries (future feature) +FT.SEARCH places "@location:[37.7749 -122.4194 10 km]" +``` + +## Error Handling + +Common error responses: +- `ERR index not found` - Index doesn't exist +- `ERR field not found` - Field not defined in schema +- `ERR invalid query syntax` - Malformed query +- `ERR document not found` - Document ID doesn't exist + +## Best Practices + +1. **Schema Design**: Plan your schema carefully - changes require reindexing +2. **Field Selection**: Only store and index fields you actually need +3. **Batch Operations**: Add multiple documents efficiently +4. **Query Testing**: Test queries for performance with realistic data +5. **Monitoring**: Monitor index size and query performance +6. **Backup**: Include search indexes in backup strategies + +## Future Enhancements + +Planned features: +- Geographic distance queries +- Advanced aggregations and faceting +- Highlighting of search results +- Synonyms and custom analyzers +- Real-time suggestions and autocomplete +- Index replication and sharding \ No newline at end of file diff --git a/herodb/src/tantivy_search.rs b/herodb/src/tantivy_search.rs index 9caaf61..0514b06 100644 --- a/herodb/src/tantivy_search.rs +++ b/herodb/src/tantivy_search.rs @@ -5,7 +5,7 @@ use tantivy::{ schema::{Schema, Field, TextOptions, TextFieldIndexing, STORED, STRING, Value}, Index, IndexWriter, IndexReader, ReloadPolicy, - Term, DateTime, + Term, DateTime, TantivyDocument, tokenizer::{TokenizerManager}, }; use std::path::PathBuf; @@ -231,8 +231,9 @@ impl TantivySearch { let writer = index.writer(50_000_000) .map_err(|e| DBError(format!("Failed to create index writer: {}", e)))?; - let reader = index.reader_builder() - .reload_policy(ReloadPolicy::Manual) + let reader = 
index + .reader_builder() + .reload_policy(ReloadPolicy::OnCommitWithDelay) .try_into() .map_err(|e| DBError(format!("Failed to create reader: {}", e)))?; @@ -360,9 +361,7 @@ impl TantivySearch { // Apply filters if any let final_query = if !options.filters.is_empty() { - let mut queries: Vec<(Occur, Box)> = Vec::new(); - queries.push((Occur::Must, query)); - + let mut clauses: Vec<(Occur, Box)> = vec![(Occur::Must, query)]; // Add filters for filter in options.filters { @@ -373,28 +372,28 @@ impl TantivySearch { Term::from_field_text(*field, &value), tantivy::schema::IndexRecordOption::Basic, ); - queries.push((Occur::Must, Box::new(term_query))); + clauses.push((Occur::Must, Box::new(term_query))); } - FilterType::Range { min, max } => { + FilterType::Range { min: _, max: _ } => { // Would need numeric field handling here // Simplified for now } FilterType::InSet(values) => { - let mut sub_queries: Vec<(Occur, Box)> = Vec::new(); + let mut sub_clauses: Vec<(Occur, Box)> = vec![]; for value in values { let term_query = TermQuery::new( Term::from_field_text(*field, &value), tantivy::schema::IndexRecordOption::Basic, ); - sub_queries.push((Occur::Should, Box::new(term_query))); + sub_clauses.push((Occur::Should, Box::new(term_query))); } - queries.push((Occur::Must, Box::new(BooleanQuery::new(sub_queries)))); + clauses.push((Occur::Must, Box::new(BooleanQuery::new(sub_clauses)))); } } } } - Box::new(BooleanQuery::new(queries)) + Box::new(BooleanQuery::new(clauses)) } else { query }; @@ -409,7 +408,7 @@ impl TantivySearch { let mut documents = Vec::new(); for (score, doc_address) in top_docs.iter().skip(options.offset).take(options.limit) { - let retrieved_doc: tantivy::TantivyDocument = searcher.doc(*doc_address) + let retrieved_doc: TantivyDocument = searcher.doc(*doc_address) .map_err(|e| DBError(format!("Failed to retrieve doc: {}", e)))?; let mut doc_fields = HashMap::new(); @@ -459,13 +458,11 @@ impl TantivySearch { } FieldDef::Geo { stored } => { if *stored { 
- let lat = retrieved_doc.get_first( - self.index_schema.fields.get(&format!("{}_lat", field_name)).unwrap().0 - ).and_then(|v| v.as_f64()); + let lat_field = self.index_schema.fields.get(&format!("{}_lat", field_name)).unwrap().0; + let lon_field = self.index_schema.fields.get(&format!("{}_lon", field_name)).unwrap().0; - let lon = retrieved_doc.get_first( - self.index_schema.fields.get(&format!("{}_lon", field_name)).unwrap().0 - ).and_then(|v| v.as_f64()); + let lat = retrieved_doc.get_first(lat_field).and_then(|v| v.as_f64()); + let lon = retrieved_doc.get_first(lon_field).and_then(|v| v.as_f64()); if let (Some(lat), Some(lon)) = (lat, lon) { doc_fields.insert(field_name.clone(), format!("{},{}", lat, lon));