Files
herolib/lib/core/texttools/tokens.v
2024-12-25 09:23:31 +01:00

183 lines
3.7 KiB
V

module texttools
// import regex
// TokenizerResult holds the tokens collected from a text by tokenize().
pub struct TokenizerResult {
pub mut:
	// one entry per unique token found in the source text
	items []TokenizerItem
}
// TokenizerItem records one token found in the source text.
pub struct TokenizerItem {
pub mut:
	// original spelling exactly as it appeared in the text
	toreplace string
	// normalized match key for comparisons: lowercased, separators collapsed
	// to underscores, then underscores removed (see name_fix_no_underscore_token)
	matchstring string
}
// text_token_replace tokenizes `text` and substitutes every token whose
// normalized form equals the normalized form of `tofind` with `replacewith`.
pub fn text_token_replace(text string, tofind string, replacewith string) !string {
	mut tokenizer_result := tokenize(text)
	return tokenizer_result.replace(text, tofind, replacewith)!
}
// replace substitutes, in `text`, every tokenized item whose normalized
// matchstring equals the normalized form of `tofind` with `replacewith`.
//
// `tofind` is normalized with name_fix_no_underscore_token so matching is
// insensitive to case, separators and underscores; the item's original
// spelling (toreplace) is what actually gets replaced in the text.
// NOTE(review): replacement is plain substring replacement, not whole-word —
// the removed dead code suggests whole-word matching was attempted and
// abandoned; behavior here is unchanged.
pub fn (mut tr TokenizerResult) replace(text string, tofind string, replacewith string) !string {
	tofind2 := name_fix_no_underscore_token(tofind)
	mut text2 := text
	for item in tr.items {
		if item.matchstring == tofind2 {
			text2 = text2.replace(item.toreplace, replacewith)
		}
	}
	return text2
}
// name_fix_no_underscore_token normalizes `name` like name_fix_token and
// then strips all underscores, yielding the canonical match form.
pub fn name_fix_no_underscore_token(name string) string {
	return name_fix_token(name).replace('_', '')
}
// Pattern/replacement pairs for string.replace_each: each odd entry is the
// pattern, the following entry its replacement. Separators (space, dash,
// '::', ';', ':', '.') all become single underscores.
// The '__' -> '_' pair appears TWICE on purpose: replace_each applies each
// pair in one pass, so two passes are needed to collapse runs of three or
// four underscores down to one ("can be 3 to 2 to 1").
const name_fix_replaces = [
	' ',
	'_',
	'-',
	'_',
	'__',
	'_',
	'__',
	'_',
	'::',
	'_',
	';',
	'_',
	':',
	'_',
	'.',
	'_',
]
// name_fix_token lowercases `name`, rewrites separator characters (spaces,
// dashes, colons, dots, ...) to single underscores via name_fix_replaces,
// and trims leading/trailing ' ', '.' and '_' characters.
pub fn name_fix_token(name string) string {
	lowered := name.to_lower()
	normalized := lowered.replace_each(name_fix_replaces)
	return normalized.trim(' ._')
}
// word_skip reports whether `text` (case-insensitive) is a common filler
// word that should not be recorded as a token.
fn word_skip(text string) bool {
	return text.to_lower() in ['the', 'some', 'and', 'plus', 'will', 'do', 'are', 'these']
}
// tokenize scans `text_` and collects candidate replacement tokens.
//
// The input is dedented, then walked line by line:
//   - lines starting with '!' or 'http' are skipped entirely
//   - a line containing ''' or ``` or """ toggles fenced-block skipping
//   - characters between [ ], ( ) or { } are ignored
//   - once an accumulating word starts with 'http' the rest of the link
//     (up to the next space) is ignored
// Every remaining word longer than 1 character that is not a filler word
// (word_skip) is recorded once, with its original spelling (toreplace) and
// its normalized form (matchstring).
pub fn tokenize(text_ string) TokenizerResult {
text := dedent(text_)
mut skip := false // true while inside [ ... ], ( ... ) or { ... }
mut skipline := false // true while inside a ''' / ``` / """ fenced block
mut prev := '' // previous character seen on the current line
mut word := '' // word currently being accumulated
mut islink := false // true while consuming the remainder of an http link
mut tr := TokenizerResult{}
mut done := []string{} // original spellings already recorded (dedup)
lines := text.split('\n')
//
for original_line in lines {
line := original_line.trim(' ')
// skip directive-style lines
if line.starts_with('!') {
continue
}
// skip lines that are just a link
if line.starts_with('http') {
continue
}
// a fence marker toggles skipping; note the fence line itself still
// falls through to the skipline check below, so it is skipped too
if line.contains("'''") || line.contains('```') || line.contains('"""') {
skipline = !skipline
}
if skipline {
continue
}
// reset per-line scanner state
prev = ''
word = ''
skip = false
splitted_line := line.split('')
for ch in splitted_line {
// opening bracket: ignore everything until the matching-style closer
if '[({'.contains(ch) {
skip = true
continue
}
if skip {
if ')]}'.contains(ch) {
skip = false
prev = ''
continue
}
} else {
if islink {
// swallow link characters until the next space
if ch == ' ' {
islink = false
} else {
continue
}
}
if 'abcdefghijklmnopqrstuvwxyz0123456789_-'.contains(ch.to_lower()) {
// only extend/start a word after a separator or at line start,
// so a word glued to punctuation like "foo'bar" is not restarted
if word.len > 0 || prev == '' || '\t\n ,:;.?!#|'.contains(prev) {
word += ch
}
if word.starts_with('http') {
islink = true
}
} else if '\t\n ,:;.?!#|'.contains(ch) {
// only when end is newline tab or whitespace or ...
if word.len > 1 && !word_skip(word) && word !in done {
word_with_no_underscores := name_fix_no_underscore_token(word)
tr.items << TokenizerItem{
toreplace: word
matchstring: word_with_no_underscores.clone()
}
done << word
}
word = ''
prev = ''
continue
} else {
// any other character invalidates the current word
word = ''
}
prev = ch
}
}
// flush the word still pending at end of the last scanned line
if word.len > 1 && !word_skip(word) && word !in done {
word_with_no_underscores := name_fix_no_underscore_token(word)
tr.items << TokenizerItem{
toreplace: word
matchstring: word_with_no_underscores.clone()
}
done << word
}
}
return tr
}