...
This commit is contained in:
@@ -70,6 +70,15 @@ MULTI/EXEC/DISCARD | ✅ | ❌ | Only supported in redb |
|
||||
**Encryption** | | | |
|
||||
Data-at-rest encryption | ✅ | ✅ | Both support [age](https://age-encryption.org) encryption
|
||||
AGE commands | ✅ | ✅ | Both support AGE crypto commands |
|
||||
**Full-Text Search** | | | |
|
||||
FT.CREATE | ✅ | ✅ | Create search index with schema |
|
||||
FT.ADD | ✅ | ✅ | Add document to search index |
|
||||
FT.SEARCH | ✅ | ✅ | Search documents with query |
|
||||
FT.DEL | ✅ | ✅ | Delete document from index |
|
||||
FT.INFO | ✅ | ✅ | Get index information |
|
||||
FT.DROP | ✅ | ✅ | Drop search index |
|
||||
FT.ALTER | ✅ | ✅ | Alter index schema |
|
||||
FT.AGGREGATE | ✅ | ✅ | Aggregate search results |
|
||||
|
||||
### Performance Considerations
|
||||
|
||||
|
397
herodb/docs/search.md
Normal file
397
herodb/docs/search.md
Normal file
@@ -0,0 +1,397 @@
|
||||
# Full-Text Search with Tantivy
|
||||
|
||||
HeroDB includes powerful full-text search capabilities powered by [Tantivy](https://github.com/quickwit-oss/tantivy), a fast full-text search engine library written in Rust. This provides Redis-compatible search commands similar to RediSearch.
|
||||
|
||||
## Overview
|
||||
|
||||
The search functionality allows you to:
|
||||
- Create search indexes with custom schemas
|
||||
- Index documents with multiple field types
|
||||
- Perform complex queries with filters
|
||||
- Work with text, numeric, date, and geographic data
|
||||
- Run real-time searches with high performance
|
||||
|
||||
## Search Commands
|
||||
|
||||
### FT.CREATE - Create Search Index
|
||||
|
||||
Create a new search index with a defined schema.
|
||||
|
||||
```bash
|
||||
FT.CREATE index_name SCHEMA field_name field_type [options] [field_name field_type [options] ...]
|
||||
```
|
||||
|
||||
**Field Types:**
|
||||
- `TEXT` - Full-text searchable text fields
|
||||
- `NUMERIC` - Numeric fields (integers, floats)
|
||||
- `TAG` - Tag fields for exact matching
|
||||
- `GEO` - Geographic coordinates (lat,lon)
|
||||
- `DATE` - Date/timestamp fields
|
||||
|
||||
**Field Options:**
|
||||
- `STORED` - Store field value for retrieval
|
||||
- `INDEXED` - Make field searchable
|
||||
- `TOKENIZED` - Enable tokenization for text fields
|
||||
- `FAST` - Enable fast access for numeric fields
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
# Create a product search index
|
||||
FT.CREATE products SCHEMA
|
||||
title TEXT STORED INDEXED TOKENIZED
|
||||
description TEXT STORED INDEXED TOKENIZED
|
||||
price NUMERIC STORED INDEXED FAST
|
||||
category TAG STORED
|
||||
location GEO STORED
|
||||
created_date DATE STORED INDEXED
|
||||
```
|
||||
|
||||
### FT.ADD - Add Document to Index
|
||||
|
||||
Add a document to a search index.
|
||||
|
||||
```bash
|
||||
FT.ADD index_name doc_id [SCORE score] FIELDS field_name field_value [field_name field_value ...]
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
# Add a product document
|
||||
FT.ADD products product:1 SCORE 1.0 FIELDS
|
||||
title "Wireless Headphones"
|
||||
description "High-quality wireless headphones with noise cancellation"
|
||||
price 199.99
|
||||
category "electronics"
|
||||
location "37.7749,-122.4194"
|
||||
created_date 1640995200000
|
||||
```
|
||||
|
||||
### FT.SEARCH - Search Documents
|
||||
|
||||
Search for documents in an index.
|
||||
|
||||
```bash
|
||||
FT.SEARCH index_name query [LIMIT offset count] [FILTER field min max] [RETURN field [field ...]]
|
||||
```
|
||||
|
||||
**Query Syntax:**
|
||||
- Simple terms: `wireless headphones`
|
||||
- Phrase queries: `"noise cancellation"`
|
||||
- Field-specific: `title:wireless`
|
||||
- Boolean operators: `wireless AND headphones`
|
||||
- Wildcards: `head*`
|
||||
|
||||
**Examples:**
|
||||
```bash
|
||||
# Simple text search
|
||||
FT.SEARCH products "wireless headphones"
|
||||
|
||||
# Search with filters
|
||||
FT.SEARCH products "headphones" FILTER price 100 300 LIMIT 0 10
|
||||
|
||||
# Field-specific search
|
||||
FT.SEARCH products "title:wireless AND category:electronics"
|
||||
|
||||
# Return specific fields only
|
||||
FT.SEARCH products "*" RETURN title price
|
||||
```
|
||||
|
||||
### FT.DEL - Delete Document
|
||||
|
||||
Remove a document from the search index.
|
||||
|
||||
```bash
|
||||
FT.DEL index_name doc_id
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
FT.DEL products product:1
|
||||
```
|
||||
|
||||
### FT.INFO - Get Index Information
|
||||
|
||||
Get information about a search index.
|
||||
|
||||
```bash
|
||||
FT.INFO index_name
|
||||
```
|
||||
|
||||
**Returns:**
|
||||
- Index name and document count
|
||||
- Field definitions and types
|
||||
- Index configuration
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
FT.INFO products
|
||||
```
|
||||
|
||||
### FT.DROP - Drop Index
|
||||
|
||||
Delete an entire search index.
|
||||
|
||||
```bash
|
||||
FT.DROP index_name
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
FT.DROP products
|
||||
```
|
||||
|
||||
### FT.ALTER - Alter Index Schema
|
||||
|
||||
Add new fields to an existing index.
|
||||
|
||||
```bash
|
||||
FT.ALTER index_name SCHEMA ADD field_name field_type [options]
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
FT.ALTER products SCHEMA ADD brand TAG STORED
|
||||
```
|
||||
|
||||
### FT.AGGREGATE - Aggregate Search Results
|
||||
|
||||
Perform aggregations on search results.
|
||||
|
||||
```bash
|
||||
FT.AGGREGATE index_name query [GROUPBY field] [REDUCE function field AS alias]
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
# Group products by category and count
|
||||
FT.AGGREGATE products "*" GROUPBY category REDUCE COUNT 0 AS count
|
||||
```
|
||||
|
||||
## Field Types in Detail
|
||||
|
||||
### TEXT Fields
|
||||
- **Purpose**: Full-text search on natural language content
|
||||
- **Features**: Tokenization, stemming, stop-word removal
|
||||
- **Options**: `STORED`, `INDEXED`, `TOKENIZED`
|
||||
- **Example**: Product titles, descriptions, content
|
||||
|
||||
### NUMERIC Fields
|
||||
- **Purpose**: Numeric data for range queries and sorting
|
||||
- **Types**: I64, U64, F64
|
||||
- **Options**: `STORED`, `INDEXED`, `FAST`
|
||||
- **Example**: Prices, quantities, ratings
|
||||
|
||||
### TAG Fields
|
||||
- **Purpose**: Exact-match categorical data
|
||||
- **Features**: No tokenization, exact string matching
|
||||
- **Options**: `STORED`, case sensitivity control
|
||||
- **Example**: Categories, brands, status values
|
||||
|
||||
### GEO Fields
|
||||
- **Purpose**: Geographic coordinates
|
||||
- **Format**: "latitude,longitude" (e.g., "37.7749,-122.4194")
|
||||
- **Features**: Geographic distance queries (planned; not yet available — see Future Enhancements)
|
||||
- **Options**: `STORED`
|
||||
|
||||
### DATE Fields
|
||||
- **Purpose**: Timestamp and date data
|
||||
- **Format**: Unix timestamp in milliseconds
|
||||
- **Features**: Range queries, temporal filtering
|
||||
- **Options**: `STORED`, `INDEXED`, `FAST`
|
||||
|
||||
## Search Query Syntax
|
||||
|
||||
### Basic Queries
|
||||
```bash
|
||||
# Single term
|
||||
FT.SEARCH products "wireless"
|
||||
|
||||
# Multiple terms (AND by default)
|
||||
FT.SEARCH products "wireless headphones"
|
||||
|
||||
# Phrase query
|
||||
FT.SEARCH products "\"noise cancellation\""
|
||||
```
|
||||
|
||||
### Field-Specific Queries
|
||||
```bash
|
||||
# Search in specific field
|
||||
FT.SEARCH products "title:wireless"
|
||||
|
||||
# Multiple field queries
|
||||
FT.SEARCH products "title:wireless AND description:bluetooth"
|
||||
```
|
||||
|
||||
### Boolean Operators
|
||||
```bash
|
||||
# AND operator
|
||||
FT.SEARCH products "wireless AND headphones"
|
||||
|
||||
# OR operator
|
||||
FT.SEARCH products "wireless OR bluetooth"
|
||||
|
||||
# NOT operator
|
||||
FT.SEARCH products "headphones NOT wired"
|
||||
```
|
||||
|
||||
### Wildcards and Fuzzy Search
|
||||
```bash
|
||||
# Wildcard search
|
||||
FT.SEARCH products "head*"
|
||||
|
||||
# Fuzzy search (approximate matching)
|
||||
FT.SEARCH products "%headphone%"
|
||||
```
|
||||
|
||||
### Range Queries
|
||||
```bash
|
||||
# Numeric range in query
|
||||
FT.SEARCH products "@price:[100 300]"
|
||||
|
||||
# Date range
|
||||
FT.SEARCH products "@created_date:[1640995200000 1672531200000]"
|
||||
```
|
||||
|
||||
## Filtering and Sorting
|
||||
|
||||
### FILTER Clause
|
||||
```bash
|
||||
# Numeric filter
|
||||
FT.SEARCH products "headphones" FILTER price 100 300
|
||||
|
||||
# Multiple filters (requires a NUMERIC `rating` field to be defined in the index schema)
|
||||
FT.SEARCH products "*" FILTER price 100 500 FILTER rating 4 5
|
||||
```
|
||||
|
||||
### LIMIT Clause
|
||||
```bash
|
||||
# Pagination
|
||||
FT.SEARCH products "wireless" LIMIT 0 10 # First 10 results
|
||||
FT.SEARCH products "wireless" LIMIT 10 10 # Next 10 results
|
||||
```
|
||||
|
||||
### RETURN Clause
|
||||
```bash
|
||||
# Return specific fields
|
||||
FT.SEARCH products "*" RETURN title price
|
||||
|
||||
# Return all stored fields (default)
|
||||
FT.SEARCH products "*"
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Indexing Strategy
|
||||
- Only index fields you need to search on
|
||||
- Use `FAST` option for frequently filtered numeric fields
|
||||
- Consider storage vs. search performance trade-offs
|
||||
|
||||
### Query Optimization
|
||||
- Use specific field queries when possible
|
||||
- Combine filters with text queries for better performance
|
||||
- Use pagination with LIMIT for large result sets
|
||||
|
||||
### Memory Usage
|
||||
- Tantivy indexes are memory-mapped for performance
|
||||
- Index size depends on document count and field configuration
|
||||
- Monitor disk space for index storage
|
||||
|
||||
## Integration with Redis Commands
|
||||
|
||||
Search indexes work alongside regular Redis data:
|
||||
|
||||
```bash
|
||||
# Store product data in Redis hash
|
||||
HSET product:1 title "Wireless Headphones" price "199.99"
|
||||
|
||||
# Index the same data for search
|
||||
FT.ADD products product:1 FIELDS title "Wireless Headphones" price 199.99
|
||||
|
||||
# Search returns document IDs that can be used with Redis commands
|
||||
FT.SEARCH products "wireless"
|
||||
# Returns: product:1
|
||||
|
||||
# Retrieve full data using Redis
|
||||
HGETALL product:1
|
||||
```
|
||||
|
||||
## Example Use Cases
|
||||
|
||||
### E-commerce Product Search
|
||||
```bash
|
||||
# Create product catalog index
|
||||
FT.CREATE catalog SCHEMA
|
||||
name TEXT STORED INDEXED TOKENIZED
|
||||
description TEXT INDEXED TOKENIZED
|
||||
price NUMERIC STORED INDEXED FAST
|
||||
category TAG STORED
|
||||
brand TAG STORED
|
||||
rating NUMERIC STORED FAST
|
||||
|
||||
# Add products
|
||||
FT.ADD catalog prod:1 FIELDS name "iPhone 14" price 999 category "phones" brand "apple" rating 4.5
|
||||
FT.ADD catalog prod:2 FIELDS name "Samsung Galaxy" price 899 category "phones" brand "samsung" rating 4.3
|
||||
|
||||
# Search queries
|
||||
FT.SEARCH catalog "iPhone"
|
||||
FT.SEARCH catalog "phones" FILTER price 800 1000
|
||||
FT.SEARCH catalog "@brand:apple"
|
||||
```
|
||||
|
||||
### Content Management
|
||||
```bash
|
||||
# Create content index
|
||||
FT.CREATE content SCHEMA
|
||||
title TEXT STORED INDEXED TOKENIZED
|
||||
body TEXT INDEXED TOKENIZED
|
||||
author TAG STORED
|
||||
published DATE STORED INDEXED
|
||||
tags TAG STORED
|
||||
|
||||
# Search content
|
||||
FT.SEARCH content "machine learning"
|
||||
FT.SEARCH content "@author:john AND @tags:ai"
|
||||
FT.SEARCH content "*" FILTER published 1640995200000 1672531200000
|
||||
```
|
||||
|
||||
### Geographic Search
|
||||
```bash
|
||||
# Create location-based index
|
||||
FT.CREATE places SCHEMA
|
||||
name TEXT STORED INDEXED TOKENIZED
|
||||
location GEO STORED
|
||||
type TAG STORED
|
||||
|
||||
# Add locations
|
||||
FT.ADD places place:1 FIELDS name "Golden Gate Bridge" location "37.8199,-122.4783" type "landmark"
|
||||
|
||||
# Geographic queries (future feature)
|
||||
FT.SEARCH places "@location:[37.7749 -122.4194 10 km]"
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
Common error responses:
|
||||
- `ERR index not found` - Index doesn't exist
|
||||
- `ERR field not found` - Field not defined in schema
|
||||
- `ERR invalid query syntax` - Malformed query
|
||||
- `ERR document not found` - Document ID doesn't exist
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Schema Design**: Plan your schema carefully — `FT.ALTER` adds new fields, but other schema changes require reindexing
|
||||
2. **Field Selection**: Only store and index fields you actually need
|
||||
3. **Batch Operations**: Add multiple documents efficiently
|
||||
4. **Query Testing**: Test queries for performance with realistic data
|
||||
5. **Monitoring**: Monitor index size and query performance
|
||||
6. **Backup**: Include search indexes in backup strategies
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Planned features:
|
||||
- Geographic distance queries
|
||||
- Advanced aggregations and faceting
|
||||
- Highlighting of search results
|
||||
- Synonyms and custom analyzers
|
||||
- Real-time suggestions and autocomplete
|
||||
- Index replication and sharding
|
@@ -5,7 +5,7 @@ use tantivy::{
|
||||
schema::{Schema, Field, TextOptions, TextFieldIndexing,
|
||||
STORED, STRING, Value},
|
||||
Index, IndexWriter, IndexReader, ReloadPolicy,
|
||||
Term, DateTime,
|
||||
Term, DateTime, TantivyDocument,
|
||||
tokenizer::{TokenizerManager},
|
||||
};
|
||||
use std::path::PathBuf;
|
||||
@@ -231,8 +231,9 @@ impl TantivySearch {
|
||||
let writer = index.writer(50_000_000)
|
||||
.map_err(|e| DBError(format!("Failed to create index writer: {}", e)))?;
|
||||
|
||||
let reader = index.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommitWithDelay)
|
||||
.try_into()
|
||||
.map_err(|e| DBError(format!("Failed to create reader: {}", e)))?;
|
||||
|
||||
@@ -360,9 +361,7 @@ impl TantivySearch {
|
||||
|
||||
// Apply filters if any
|
||||
let final_query = if !options.filters.is_empty() {
|
||||
let mut queries: Vec<(Occur, Box<dyn Query>)> = Vec::new();
|
||||
queries.push((Occur::Must, query));
|
||||
|
||||
let mut clauses: Vec<(Occur, Box<dyn Query>)> = vec![(Occur::Must, query)];
|
||||
|
||||
// Add filters
|
||||
for filter in options.filters {
|
||||
@@ -373,28 +372,28 @@ impl TantivySearch {
|
||||
Term::from_field_text(*field, &value),
|
||||
tantivy::schema::IndexRecordOption::Basic,
|
||||
);
|
||||
queries.push((Occur::Must, Box::new(term_query)));
|
||||
clauses.push((Occur::Must, Box::new(term_query)));
|
||||
}
|
||||
FilterType::Range { min, max } => {
|
||||
FilterType::Range { min: _, max: _ } => {
|
||||
// TODO: range filters are currently ignored (no clause is added); numeric field handling is needed here
|
||||
// Simplified for now
|
||||
}
|
||||
FilterType::InSet(values) => {
|
||||
let mut sub_queries: Vec<(Occur, Box<dyn Query>)> = Vec::new();
|
||||
let mut sub_clauses: Vec<(Occur, Box<dyn Query>)> = vec![];
|
||||
for value in values {
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(*field, &value),
|
||||
tantivy::schema::IndexRecordOption::Basic,
|
||||
);
|
||||
sub_queries.push((Occur::Should, Box::new(term_query)));
|
||||
sub_clauses.push((Occur::Should, Box::new(term_query)));
|
||||
}
|
||||
queries.push((Occur::Must, Box::new(BooleanQuery::new(sub_queries))));
|
||||
clauses.push((Occur::Must, Box::new(BooleanQuery::new(sub_clauses))));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Box::new(BooleanQuery::new(queries))
|
||||
Box::new(BooleanQuery::new(clauses))
|
||||
} else {
|
||||
query
|
||||
};
|
||||
@@ -409,7 +408,7 @@ impl TantivySearch {
|
||||
let mut documents = Vec::new();
|
||||
|
||||
for (score, doc_address) in top_docs.iter().skip(options.offset).take(options.limit) {
|
||||
let retrieved_doc: tantivy::TantivyDocument = searcher.doc(*doc_address)
|
||||
let retrieved_doc: TantivyDocument = searcher.doc(*doc_address)
|
||||
.map_err(|e| DBError(format!("Failed to retrieve doc: {}", e)))?;
|
||||
|
||||
let mut doc_fields = HashMap::new();
|
||||
@@ -459,13 +458,11 @@ impl TantivySearch {
|
||||
}
|
||||
FieldDef::Geo { stored } => {
|
||||
if *stored {
|
||||
let lat = retrieved_doc.get_first(
|
||||
self.index_schema.fields.get(&format!("{}_lat", field_name)).unwrap().0
|
||||
).and_then(|v| v.as_f64());
|
||||
let lat_field = self.index_schema.fields.get(&format!("{}_lat", field_name)).unwrap().0;
|
||||
let lon_field = self.index_schema.fields.get(&format!("{}_lon", field_name)).unwrap().0;
|
||||
|
||||
let lon = retrieved_doc.get_first(
|
||||
self.index_schema.fields.get(&format!("{}_lon", field_name)).unwrap().0
|
||||
).and_then(|v| v.as_f64());
|
||||
let lat = retrieved_doc.get_first(lat_field).and_then(|v| v.as_f64());
|
||||
let lon = retrieved_doc.get_first(lon_field).and_then(|v| v.as_f64());
|
||||
|
||||
if let (Some(lat), Some(lon)) = (lat, lon) {
|
||||
doc_fields.insert(field_name.clone(), format!("{},{}", lat, lon));
|
||||
|
Reference in New Issue
Block a user