This commit is contained in:
2024-12-25 09:23:31 +01:00
parent 01ca5897db
commit 4e030b794d
306 changed files with 35071 additions and 22 deletions

View File

@@ -0,0 +1,77 @@
module texttools
// A comma- or newline-separated list gets converted to a list of strings.
// Items wrapped in single quotes ('..') have the quotes removed.
// check also splitsmart which is more intelligent
// Split a comma- or newline-separated list into individual strings.
// Each item is whitespace-trimmed and stripped of surrounding single
// quotes; items that are empty (or only quotes) are dropped.
pub fn to_array(r string) []string {
	normalized := dedent(r).replace(',', '\n')
	mut items := []string{}
	for raw in normalized.split_into_lines() {
		candidate := raw.trim_space()
		// skip entries that are empty once quotes are ignored
		if candidate.trim('\'"') == '' {
			continue
		}
		items << candidate.trim("'")
	}
	return items
}
// Split a comma- or newline-separated list and convert each item to int.
pub fn to_array_int(r string) []int {
	return to_array(r).map(it.int())
}
// intelligent way how to map a line to a map
//```
// r:=texttools.to_map("name,-,-,-,-,pid,-,-,-,-,path",
// "root 304 0.0 0.0 408185328 1360 ?? S 16Dec23 0:34.06 /usr/sbin/distnoted\n \n", '')
// assert {'name': 'root', 'pid': '1360', 'path': '/usr/sbin/distnoted'} == r
// r3:=texttools.to_map("name,-,-,-,-,pid,-,-,-,-,path",
// "root 304 0.0 0.0 408185328 1360 ?? S 16Dec23 0:34.06 \n \n", '')
// assert {'name': 'root', 'pid': '1360', 'path': ''} == r3
//```
// Field names equal to '-' in mapstring are skipped; columns missing from
// the line map to ''. delimiter_ is passed through to split_smart for the
// line ('' lets split_smart decide).
pub fn to_map(mapstring string, line string, delimiter_ string) map[string]string {
	// mapstring itself is always comma/space separated, keep auto-splitting
	keys := split_smart(mapstring, '')
	// fix: delimiter_ was accepted but silently ignored before
	values := split_smart(line, delimiter_)
	mut result := map[string]string{}
	for i, key in keys {
		if key != '-' {
			result[key] = values[i] or { '' }
		}
	}
	return result
}
// smart way how to get useful info out of text block
// ```
// t:='
// _cmiodalassistants 304 0.0 0.0 408185328 1360 ?? S 16Dec23 0:34.06 /usr/sbin/distnoted agent
// _locationd 281 0.0 0.0 408185328 1344 ?? S 16Dec23 0:35.80 /usr/sbin/distnoted agent
// root 275 0.0 0.0 408311904 7296 ?? Ss 16Dec23 2:00.56 /usr/libexec/storagekitd
// _coreaudiod 268 0.0 0.0 408185328 1344 ?? S 16Dec23 0:35.49 /usr/sbin/distnoted agent
// '
// r4:=texttools.to_list_map("name,-,-,-,-,pid,-,-,-,-,path",t,'')
// assert [{'name': '_cmiodalassistants', 'pid': '1360', 'path': '/usr/sbin/distnoted'},
// {'name': '_locationd', 'pid': '1344', 'path': '/usr/sbin/distnoted'},
// {'name': 'root', 'pid': '7296', 'path': '/usr/libexec/storagekitd'},
// {'name': '_coreaudiod', 'pid': '1344', 'path': '/usr/sbin/distnoted'}] == r4
// ```
// Apply to_map to every non-empty line of the (dedented) text block and
// return the resulting list of maps.
pub fn to_list_map(mapstring string, txt_ string, delimiter_ string) []map[string]string {
	cleaned := dedent(remove_empty_lines(txt_))
	return cleaned.split_into_lines().map(to_map(mapstring, it, delimiter_))
}

103
lib/core/texttools/clean.v Normal file
View File

@@ -0,0 +1,103 @@
// make sure that the names are always normalized so its easy to find them back
module texttools
const ignore_for_name = '\\/[]()?!@#$%^&*<>:;{}|~'
const keep_ascii = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_-+={}[]"\':;?/>.<,|\\~` '
// Normalize a name by dropping every character listed in ignore_for_name.
pub fn name_clean(r string) string {
	mut cleaned := ''
	for ch in r {
		c := ch.ascii_str()
		if !ignore_for_name.contains(c) {
			cleaned += c
		}
	}
	return cleaned
}
// remove all chars which are not ascii
// Only characters present in keep_ascii survive.
pub fn ascii_clean(r string) string {
	mut cleaned := ''
	for ch in r {
		c := ch.ascii_str()
		if keep_ascii.contains(c) {
			cleaned += c
		}
	}
	return cleaned
}
// https://en.wikipedia.org/wiki/Unicode#Standardized_subsets
// Drop every line that is empty or contains only whitespace.
pub fn remove_empty_lines(text string) string {
	kept := text.split_into_lines().filter(it.trim_space() != '')
	return kept.join('\n')
}
// Collapse runs of consecutive empty lines into a single empty line.
// Leading empty lines are dropped entirely (prev starts true), and a
// single trailing empty line is removed at the end.
pub fn remove_double_lines(text string) string {
	mut out := []string{}
	// true while the previously emitted line was empty (or nothing emitted yet)
	mut prev := true
	for l in text.split_into_lines() {
		if l.trim_space() == '' {
			if prev {
				// already emitted an empty line (or still at start): skip
				continue
			}
			out << ''
			prev = true
			continue
		}
		prev = false
		out << l
	}
	// drop one trailing empty line, if any
	if out.len > 0 && out.last() == '' {
		out.pop()
	}
	return out.join('\n')
}
// remove ```?? ``` , can be over multiple lines .
// also removes double lines
// Fenced blocks opened/closed by ``` or ''' or """ whose body is empty or
// whitespace-only are removed entirely; blocks with content are kept.
pub fn remove_empty_js_blocks(text string) string {
	mut out := []string{}
	mut block_capture_pre := '' // the opening fence line
	mut block_capture_inside := []string{} // lines between the fences
	mut foundblock := false // true while inside a fenced block
	for l in text.split_into_lines() {
		lt := l.trim_space()
		if lt.starts_with('```') || lt.starts_with("'''") || lt.starts_with('"""') {
			if foundblock {
				// closing fence: emit the block only if it had real content
				if block_capture_inside.filter(it.trim_space() != '').len > 0 {
					// now we know the block inside is not empty
					out << block_capture_pre
					out << block_capture_inside
					out << l // the last line
				}
				foundblock = false
				block_capture_pre = ''
				block_capture_inside = []string{}
				continue
			} else {
				// opening fence: start capturing
				foundblock = true
				block_capture_pre = l
				continue
			}
		}
		if foundblock {
			block_capture_inside << l
		} else {
			out << l
		}
	}
	// NOTE(review): a block left unterminated at EOF is silently dropped — confirm intended
	if out.len > 0 && out.last() == '' {
		out.pop()
	}
	return remove_double_lines(out.join('\n'))
}

View File

@@ -0,0 +1,49 @@
module texttools
// Empty '''js and ```js blocks must be removed; the non-empty '''js
// block (containing "inside") must survive.
fn test_clean1() {
	mut text := "
'''js
'''
something
yes
else
```js
```
'''js
inside
'''
"
	mut result := "
something
yes
else
'''js
inside
'''
"
	text = dedent(text)
	result = dedent(result)
	text2 := remove_double_lines(remove_empty_js_blocks(text))
	print('---')
	print(text2)
	print('---')
	print(result)
	print('---')
	assert text2.trim_space() == result.trim_space()
}

View File

@@ -0,0 +1,106 @@
module texttools
// Parser state while scanning command-line style text.
// NOTE(review): not referenced by any function visible in this file — confirm still needed.
enum TextArgsStatus {
	start
	quote // quote found means value in between ''
}
// remove all '..' and "..." from a text, so everything in between the quotes
// Backslash-escaped quotes do not open/close a quoted section.
pub fn text_remove_quotes(text string) string {
	mut result := ''
	mut inside := false // currently between an opening and closing quote
	mut prev := ''
	for i in 0 .. text.len {
		c := text[i..i + 1]
		if c in ['"', "'"] && prev != '\\' {
			// unescaped quote: toggle quoted mode, quote char itself is dropped
			inside = !inside
			prev = c
			continue
		}
		if !inside {
			result += c
		}
		prev = c
	}
	return result
}
// test if an element off the array exists in the text but ignore quotes
// Quoted sections are stripped first, then each item is searched.
pub fn check_exists_outside_quotes(text string, items []string) bool {
	stripped := text_remove_quotes(text)
	return items.any(stripped.contains(it))
}
// convert text string to arguments
// \n supported but will be \\n and only supported within '' or ""
// \' not modified, same for \"
// Splits on spaces while keeping quoted ('..' or "..") sections together.
// Special case: if a redirection/pipe char (<, >, |) appears outside
// quotes, the text is split only once into [command, rest-of-line].
pub fn cmd_line_args_parser(text string) ![]string {
	mut res := []string{}
	mut quote := '' // quote char we are currently inside, '' if none
	mut char_previous := ''
	mut arg := '' // argument currently being accumulated
	mut ch := ''
	// shell redirections: don't fully parse, hand the remainder through
	if check_exists_outside_quotes(text, ['<', '>', '|']) {
		if !(text.contains(' ')) {
			return error("cannot convert text '${text}' to args because no space to split")
		}
		splitted := text.split_nth(' ', 2)
		return [splitted[0], splitted[1]]
	}
	for i in 0 .. text.len {
		ch = text[i..i + 1]
		// skip spaces which are not escaped
		if ch == ' ' && arg == '' {
			continue
		}
		if ch in ['"', "'"] {
			if char_previous != '\\' {
				if quote == '' {
					// beginning of quote need to close off previous arg
					if arg != '' {
						res << arg.trim(' ')
						arg = ''
					}
					quote = ch
					char_previous = ch
					continue
				} else {
					// end of quote
					quote = ''
					res << arg.trim(' ')
					arg = ''
					char_previous = ch
					continue
				}
			}
		}
		if quote != '' {
			// unmodified add, because we are in quote
			arg += ch
		} else {
			if ch == ' ' && arg != '' {
				res << arg.trim(' ')
				arg = ''
			} else {
				arg += ch
			}
		}
		char_previous = ch
	}
	// flush the last argument, if any
	if arg != '' {
		res << arg.trim(' ')
	}
	return res
}

View File

@@ -0,0 +1,38 @@
module texttools
// how to process command lines
// Covers quoted args (both quote styles), surrounding spaces, plain
// splitting, and the redirection special case of cmd_line_args_parser.
fn test_cmdline_args() {
	mut r := []string{}
	r = cmd_line_args_parser("'aa bb' ' cc dd' one -two") or { panic(err) }
	assert r == ['aa bb', 'cc dd', 'one', '-two']
	r = cmd_line_args_parser("'\taa bb' ' cc dd' one -two") or { panic(err) }
	assert r == ['\taa bb', 'cc dd', 'one', '-two']
	// now spaces
	r = cmd_line_args_parser(" '\taa bb' ' cc dd' one -two ") or { panic(err) }
	assert r == ['\taa bb', 'cc dd', 'one', '-two']
	// now other quote
	r = cmd_line_args_parser('"aa bb" " cc dd" one -two') or { panic(err) }
	assert r == ['aa bb', 'cc dd', 'one', '-two']
	r = cmd_line_args_parser('"aa bb" \' cc dd\' one -two') or { panic(err) }
	assert r == ['aa bb', 'cc dd', 'one', '-two']
	r = cmd_line_args_parser('find . /tmp') or { panic(err) }
	assert r == ['find', '.', '/tmp']
	r = cmd_line_args_parser("bash -c 'find /'") or { panic(err) }
	assert r == ['bash', '-c', 'find /']
	mut r2 := string('')
	r2 = text_remove_quotes('echo "hi >" > /tmp/a.txt')
	assert r2 == 'echo > /tmp/a.txt'
	r2 = text_remove_quotes("echo 'hi >' > /tmp/a.txt")
	assert r2 == 'echo > /tmp/a.txt'
	r2 = text_remove_quotes("echo 'hi >' /tmp/a.txt")
	assert r2 == 'echo /tmp/a.txt'
	assert check_exists_outside_quotes("echo 'hi >' > /tmp/a.txt", ['<', '>', '|'])
	assert check_exists_outside_quotes("echo 'hi ' /tmp/a.txt |", ['<', '>', '|'])
	assert !check_exists_outside_quotes("echo 'hi >' /tmp/a.txt", ['<', '>', '|'])
	// redirection present: only split command from the rest
	r = cmd_line_args_parser('echo "hi" > /tmp/a.txt') or { panic(err) }
	assert r == ['echo', '"hi" > /tmp/a.txt']
}

View File

@@ -0,0 +1,13 @@
module texttools
// texttools.expand('|', 20, ' ')
// Pad txt_ on the right with repetitions of expand_with, then cut the
// result back to length l (so txt_ longer than l is truncated too).
pub fn expand(txt_ string, l int, expand_with string) string {
	mut padded := txt_
	for _ in 0 .. l {
		padded += expand_with
	}
	return if padded.len > l { padded[0..l] } else { padded }
}

View File

@@ -0,0 +1,46 @@
module texttools
// Prefix every line of text with `prefix`; the result always ends with
// a trailing newline.
pub fn indent(text string, prefix string) string {
	prefixed := text.split_into_lines().map(prefix + it)
	mut out := prefixed.join_lines()
	if !out.ends_with('\n') {
		out += '\n'
	}
	return out
}
// remove all leading spaces at same level
// Determines the smallest indentation over all non-empty lines and strips
// that many leading characters from every line. Whitespace-only lines
// become '' and do not influence the computed indent.
pub fn dedent(text string) string {
	mut pre := 999 // smallest indent seen so far (sentinel larger than any real line)
	mut pre_current := 0
	mut res := []string{}
	text_lines := text.split_into_lines()
	// first pass: find the common indentation width
	for line2 in text_lines {
		if line2.trim_space() == '' {
			continue
		}
		// NOTE(review): a tab counts as a single space here, not a tab stop
		line2_expanded_tab := line2.replace('\t', ' ')
		line2_expanded_tab_trimmed := line2_expanded_tab.trim_left(' ')
		pre_current = line2_expanded_tab.len - line2_expanded_tab_trimmed.len
		if pre > pre_current {
			pre = pre_current
		}
	}
	// now remove the prefix length
	for line2 in text_lines {
		line2_expanded_tab := line2.replace('\t', ' ') // important to deal with tabs
		line2_expanded_tab_trimmed := line2.trim_space()
		if line2_expanded_tab_trimmed == '' {
			res << ''
		} else {
			// safe: pre is the minimum indent over non-empty lines
			res << line2_expanded_tab[pre..]
		}
	}
	final_result := res.join_lines()
	return final_result
}

View File

@@ -0,0 +1,15 @@
module texttools
// Dedent should strip the common leading whitespace of the block.
// NOTE(review): the literal's original indentation appears lost in this
// copy of the file; the length assertion below depends on it — verify.
fn test_dedent() {
	mut text := '
a
b
c
d
'
	text = dedent(text)
	assert text.len == 20
}

View File

@@ -0,0 +1,31 @@
module texttools
// True when every byte of text is an ASCII digit (0-9).
// Note: the empty string yields true.
pub fn is_int(text string) bool {
	return text.bytes().all(it.is_digit())
}
// True when every byte of text is an uppercase ASCII letter (A-Z).
// Note: the empty string yields true.
pub fn is_upper_text(text string) bool {
	return text.bytes().all(it.is_capital())
}
// fn sid_check(sid string) bool {
// if sid.len > 6 || sid.len < 2 {
// return false
// }
// for cha in sid {
// if (cha < 48 || cha > 57) && (cha < 97 || cha > 122) {
// return false
// }
// }
// return true
// }

View File

@@ -0,0 +1,18 @@
module texttools
// Digit-only and uppercase-only predicates, incl. negative cases.
fn test_istest1() {
	assert is_int('0000')
	assert is_int('999')
	assert is_int('0')
	assert is_int('9')
	assert is_int('00 00') == false
	assert is_int('00a00') == false
	assert is_upper_text('A')
	assert is_upper_text('Z')
	assert is_upper_text('AAZZZZAAA')
	assert is_upper_text('z') == false
	assert is_upper_text('AAZZZZaAA') == false
	assert is_upper_text('AAZZZZ?AA') == false
	assert is_upper_text("AAZZZZ'AA") == false
}

View File

@@ -0,0 +1,163 @@
module texttools
// State of the multiline_to_single() parser.
pub enum MultiLineStatus {
	start // normal line-by-line handling
	multiline // inside a quoted multiline value
	comment // inside a multi-line comment
}
// converts a multiline text to a single line, keeping all relevant information
// empty lines removed (unless if in parameter)
// commented lines removed as well (starts with // and #)
// multiline to 'line1\\nline2\\n'
// dedent also done before putting in '...'
// tabs also replaced to 4x space
// State machine over .start / .multiline / .comment; see comments above.
pub fn multiline_to_single(text string) !string {
	mut multiline_first := '' // line that opened the multiline value (e.g. "descr:'")
	mut multiline := '' // body collected while in .multiline state
	// mut comment_first:=""
	mut comment := []string{} // lines collected while in .comment state
	mut line2 := ''
	mut res := []string{} // output fragments, joined with ' ' at the end
	mut state := MultiLineStatus.start
	for line in text.split_into_lines() {
		line2 = line
		line2 = line2.replace('\t', ' ')
		mut line2_trimmed := line2.trim_space()
		if state == .multiline {
			if multiline_end_check(line2_trimmed) {
				// means we are out of multiline
				res << multiline_end(multiline_first, multiline)
				multiline_first = ''
				multiline = ''
				state = .start
			} else {
				multiline += '${line2}\n'
			}
			continue
		}
		if state == .comment {
			if comment_end_check(line2_trimmed) {
				// means we are out of multiline
				res << comment_end(comment)
				comment = []string{}
				state = .start
				// NOTE(review): the line ending the comment falls through and is
				// then processed again by the .start handling below — confirm intended
			} else {
				comment << line2_trimmed
				continue
			}
		}
		if state == .start {
			if line2_trimmed == '' {
				continue
			}
			// deal with comments
			mut commentpart := ''
			line2_trimmed, commentpart = comment_start_check(mut res, line2_trimmed)
			if commentpart.len > 0 {
				state = .comment
				comment = []string{}
				comment << commentpart
				continue
			}
			if multiline_start_check(line2_trimmed) {
				// means is multiline
				state = .multiline
				multiline_first = line2_trimmed
				continue
			}
			res << line2_trimmed.trim('\n ')
		}
	}
	// last one: flush whatever state we ended in
	if state == .multiline {
		res << multiline_end(multiline_first, multiline)
	}
	if state == .comment {
		res << comment_end(comment)
	}
	return res.join(' ')
}
// Build the single-line form "name:'...'" for a finished multiline value:
// the body is dedented, newlines become literal \\n, and single quotes in
// the body are converted to double quotes.
fn multiline_end(multiline_first string, multiline string) string {
	mut multiline2 := multiline
	multiline2 = dedent(multiline2)
	multiline2 = multiline2.replace('\n', '\\\\n')
	multiline2 = multiline2.replace("'", '"')
	// content that already followed "name:" on the opening line, if any
	firstline_content := multiline_first.all_after_first(':').trim_left('" \'')
	name := multiline_first.all_before(':').trim_space()
	if firstline_content.trim_space() != '' {
		// NOTE(review): embeds the full opening line (incl. "name:") rather
		// than just firstline_content — confirm intended
		multiline2 = "${name}:'${multiline_first}\\n${multiline2}'"
	} else {
		multiline2 = "${name}:'${multiline2}'"
	}
	return multiline2
}
// check that there is multiline start
// A line opens a multiline value when, after collapsing spaces that
// follow a ':', it ends on an opening quote right after the ':'.
fn multiline_start_check(text_ string) bool {
	if text_ == '' {
		return false
	}
	mut text := text_
	for _ in 0 .. 3 {
		// collapse ': ' so `key: '` is detected like `key:'`
		text = text.replace(': ', ':')
	}
	return [":'", ':"', ':"""', ":'''"].any(text.ends_with(it))
}
// A multiline value ends on a line that is exactly a closing quote.
fn multiline_end_check(text string) bool {
	return text in ["'", '"', '"""', "'''"]
}
// return all before comment and if comment
// return trimmedtext,commentpart
// Two styles are recognised: '<!--' opens an html-style comment; '//'
// marks a line comment unless the part after it contains a single quote
// (then it is assumed to be inside a string value and left untouched).
fn comment_start_check(mut res []string, text_ string) (string, string) {
	mut text := text_
	if text.starts_with('<!--') {
		text = text.replace('<!--', '').trim_space()
		return '', text
	}
	if !(text.contains('//')) {
		return text, ''
	}
	mightbecomment := text.all_after_last('//')
	if !(mightbecomment.contains("'")) {
		// means we found a comment at end of line, and is not part of string statement (value)
		text = text.all_before_last('//').trim_space()
		if text.len > 0 {
			// trailing comment: emit immediately in its single-line form
			res << '//${mightbecomment}-/'
			return text, ''
		} else {
			// the whole line was a comment: caller switches to comment mode
			return '', mightbecomment
		}
	}
	return text, ''
}
// A multi-line comment ends on '-->' or on any line that is not itself
// a '//' comment line.
fn comment_end_check(text string) bool {
	if text.ends_with('-->') {
		return true
	}
	return !text.starts_with('//')
}
// Render collected comment lines in single-line form: each line is
// stripped of comment markers, joined with literal \\n, and wrapped as
// '//...-/'.
fn comment_end(comment []string) string {
	trimmed := comment.map(it.trim(' <->/\n'))
	return '//' + trimmed.join('\\\\n') + '-/'
}

View File

@@ -0,0 +1,205 @@
module texttools
// Test helper: compare tocheck_ (after escaping literal \n and trimming)
// with the trimmed output; panics with a diff-friendly message on mismatch.
fn check_result(tocheck_ string, output string) {
	mut tocheck := tocheck_
	tocheck = tocheck.replace('\\n', '\\\\n')
	// tocheck=tocheck.replace("\'","\\'")
	tocheck = tocheck.trim_space()
	if tocheck == output.trim_space() {
		return
	}
	// fix: include both strings so a failure is actually debuggable
	panic('required result not correct.\nexpected: ${tocheck}\ngot: ${output.trim_space()}')
}
// Full conversion incl. nested quotes and a fenced code block inside the
// multiline description value.
fn test_multiline1() {
	mut text := "
id:a1
name:'need to do something 1'
description:'
## markdown works in it
description can be multiline
lets see what happens
'yes, this needs to work too'
- a
- something else
- 'something
### subtitle
```python
#even code block in the other block, crazy parsing for sure
def test():
```
'
"
	text = multiline_to_single(text) or { panic(err) }
	required_result := 'id:a1 name:\'need to do something 1\' description:\'## markdown works in it\\n\\ndescription can be multiline\\nlets see what happens\\n\\n"yes, this needs to work too"\\n\\n- a\\n- something else\\n- "something\\n\\n### subtitle\\n\\n```python\\n#even code block in the other block, crazy parsing for sure\\ndef test():\\n\\n```\''
	check_result(required_result, text)
}
// Multiline value opened with " and closed with ' still converts.
fn test_multiline2() {
	mut text := '
id:a1
name:\'need to do something 1\'
description:"
## markdown works in it
description can be multiline
lets see what happens
\'
'
	text = multiline_to_single(text) or { panic(err) }
	required_result := "id:a1 name:'need to do something 1' description:'## markdown works in it\\n\\ndescription can be multiline\\nlets see what happens'"
	check_result(required_result, text)
}
// Triple-quoted opener (""") with space after the colon.
fn test_multiline3() {
	mut text := '
id:a1
name:\'need to do something 1\'
description: """
## markdown works in it
description can be multiline
lets see what happens
\'
'
	text = multiline_to_single(text) or { panic(err) }
	required_result := "id:a1 name:'need to do something 1' description:'## markdown works in it\\n\\ndescription can be multiline\\nlets see what happens'"
	check_result(required_result, text)
}
// Triple-quoted opener and matching triple-quoted closer.
fn test_multiline4() {
	mut text := '
id:a1
name:\'need to do something 1\'
description: """
## markdown works in it
description can be multiline
lets see what happens
"""
'
	text = multiline_to_single(text) or { panic(err) }
	required_result := "id:a1 name:'need to do something 1' description:'## markdown works in it\\n\\ndescription can be multiline\\nlets see what happens'"
	check_result(required_result, text)
}
// Trailing and standalone // comments become '//...-/' fragments.
fn test_multiline5() {
	mut text := "
id:a1 //comment1
// a comment
name:'need to do something 1'
description: '
## markdown works in it
description can be multiline
lets see what happens
'
//another comment
"
	text = multiline_to_single(text) or { panic(err) }
	required_result := "//comment1-/ id:a1 //a comment-/ name:'need to do something 1' description:'## markdown works in it\\n\\ndescription can be multiline\\nlets see what happens' //another comment-/"
	check_result(required_result, text)
}
// Consecutive // comment lines collapse into one '//...-/' fragment;
// an html <!--...--> comment is handled too.
fn test_multiline6() {
	mut text := "
id:a1 //comment1
// comment m 1
// comment m 2
//
// comment m 3
//
name:'need to do something 1'
description: '
## markdown works in it
description can be multiline
lets see what happens
'
<!--another comment-->
"
	text = multiline_to_single(text) or { panic(err) }
	required_result := "//comment1-/ id:a1 //comment m 1\\ncomment m 2\\n\\ncomment m 3\\n-/ name:'need to do something 1' description:'## markdown works in it\\n\\ndescription can be multiline\\nlets see what happens' //another comment-/"
	check_result(required_result, text)
}
// @[assert_continues]
// fn test_comment_start_check() {
// // TEST: `hello // world, this is mario'`, `hello //world //this is mario`
// mut res := []string{}
// mut str := "hello // world, this is mario'"
// mut text, mut comment := comment_start_check(mut res, str)
// assert text == 'hello'
// assert res == ["// world, this is mario'-/"]
// assert comment == ''
// res = []string{}
// str = 'hello //world //this is mario'
// text, comment = comment_start_check(mut res, str)
// assert text == 'hello'
// assert res == ['//world //this is mario-/']
// assert comment == ''
// }
// @[assert_continues]
// fn test_multiline_start_check() {
// // TEST: `hello '''world:'''`, `hello ' world:'`, `hello " world:"`, `hello """ world: """`
// mut text := ["hello '''world:'''", "hello ' world:'", 'hello " world:"', 'hello """ world: """',
// 'hello world: """\n"""']
// expected := [false, false, false, false, true]
// for idx, input in text {
// got := multiline_start_check(input)
// assert got == expected[idx]
// }
// }
// TODO: not supported yet, requires a Comment Struct, which knows its <!-- format
// fn test_multiline7() {
// mut text := "
// id:a1 //comment1
// <!-- comment m 1
// comment m 2
// comment m 3
// -->
// name:'need to do something 1'
// description: '
// ## markdown works in it
// description can be multiline
// lets see what happens
// '
// <!--another comment-->
// "
// text = multiline_to_single(text) or { panic(err) }
// required_result:="//comment1-/ id:a1 //comment m 1\\ncomment m 2\\n\\ncomment m 3\\n-/ name:'need to do something 1' description:'## markdown works in it\\n\\ndescription can be multiline\\nlets see what happens' //another comment-/"
// check_result(required_result,text)
// }

View File

@@ -0,0 +1,178 @@
// make sure that the names are always normalized so its easy to find them back
module texttools
import os
// Normalize an email address: lowercase, trim, and extract the address
// from display-name forms like 'John Doe <john@example.com>'.
// Errors when the result is not ASCII or still contains spaces.
pub fn email_fix(name string) !string {
	mut name2 := name.to_lower().trim_space()
	if name2.contains('<') {
		// fix: the second split used '<' again, leaving the trailing '>' in
		// the result; take the part between '<' and '>'
		name2 = name2.all_after('<').all_before('>')
	}
	if !name2.is_ascii() {
		return error('email needs to be ascii, was ${name}')
	}
	if name2.contains(' ') {
		return error('email cannot have spaces, was ${name}')
	}
	return name2
}
// like name_fix but _ becomes space
pub fn name_fix_keepspace(name string) !string {
	return name_fix(name).replace('_', ' ')
}
// fix a string which represents a telephone number
// Normalize a telephone number: lowercase, trim, and strip formatting
// characters (brackets, dashes, dots, stars, spaces).
// Errors when the result is not ASCII.
pub fn tel_fix(name_ string) !string {
	mut name := name_.to_lower().trim_space()
	for x in ['[', ']', '{', '}', '(', ')', '*', '-', '.', ' '] {
		name = name.replace(x, '')
	}
	if !name.is_ascii() {
		// fix: message said 'email' (copy/paste from email_fix)
		return error('tel needs to be ascii, was ${name}')
	}
	return name
}
// Tidy wiki content: collapse runs of blank lines (up to 5 passes) and
// remove the blank line directly before a list item.
pub fn wiki_fix(content_ string) string {
	mut content := content_
	for _ in 0 .. 5 {
		content = content.replace('\n\n\n', '\n\n')
	}
	return content.replace('\n\n-', '\n-')
}
// Format a value for an action: single-line values are trimmed; multiline
// values are re-emitted with each line prefixed by one space, wrapped in
// leading/trailing newlines.
pub fn action_multiline_fix(content string) string {
	if !content.trim_space().contains('\n') {
		return content.trim_space()
	}
	mut out := '\n'
	for item in content.split('\n') {
		out += ' ${item}\n'
	}
	return out
}
// Normalize a name (lowercase, safe chars); any extension is preserved.
pub fn name_fix(name string) string {
	return name_fix_keepext(name)
}
// Normalize a comma-separated list of names; each entry is trimmed and
// name_fix-ed individually.
pub fn name_fix_list(name string) []string {
	fixed := name_fix_keepext(name)
	return fixed.split(',').map(name_fix(it.trim_space()))
}
// get name back keep extensions and underscores, but when end on .md then remove extension
pub fn name_fix_no_md(name string) string {
	fixed := name_fix_keepext(name)
	if fixed.ends_with('.md') {
		return fixed[0..fixed.len - 3]
	}
	return fixed
}
// Normalize a name and remove all underscores (extension preserved).
pub fn name_fix_no_underscore(name string) string {
	return name_fix_keepext(name).replace('_', '')
}
// snake_case -> PascalCase (underscores become word boundaries).
pub fn name_fix_snake_to_pascal(name string) string {
	return name.replace('_', ' ').title().replace(' ', '')
}
// dot.notation -> PascalCase (dots become word boundaries).
pub fn name_fix_dot_notation_to_pascal(name string) string {
	return name.replace('.', ' ').title().replace(' ', '')
}
// Convert both underscores and dots into PascalCase word boundaries.
pub fn name_fix_pascal(name string) string {
	return name_fix_dot_notation_to_pascal(name_fix_snake_to_pascal(name))
}
// PascalCase -> snake_case: an '_' is inserted before every capital
// (except at position 0), then everything is lowercased.
pub fn name_fix_pascal_to_snake(name string) string {
	mut parts := []string{}
	for i, c in name {
		if c.is_capital() && i != 0 {
			parts << '_'
		}
		parts << c.ascii_str()
	}
	return parts.join('').to_lower()
}
// dot.notation -> snake_case (dots replaced by underscores).
pub fn name_fix_dot_notation_to_snake_case(name string) string {
	return name.replace('.', '_')
}
// remove underscores and extension
pub fn name_fix_no_underscore_no_ext(name_ string) string {
	base := name_fix_keepext(name_).all_before_last('.')
	return base.replace('_', '')
}
// remove the extension (and any trailing underscore left over)
pub fn name_fix_no_ext(name_ string) string {
	return name_fix_keepext(name_).all_before_last('.').trim_right('_')
}
// Normalize a name while keeping its (lowercased) file extension.
// Steps: lowercase+trim, drop any '#fragment', temporarily remove the
// extension, keep only [a-z0-9_]; separator chars (-;:. and space)
// become '_' unless the previously kept character was already '_'.
pub fn name_fix_keepext(name_ string) string {
	mut name := name_.to_lower().trim_space()
	if name.contains('#') {
		old_name := name
		name = old_name.split('#')[0]
	}
	// need to replace . to _ but not the last one (because is ext)
	fext := os.file_ext(name)
	extension := fext.trim('.')
	if extension != '' {
		name = name[..(name.len - extension.len - 1)]
	}
	to_replace_ := '-;:. '
	mut to_replace := []u8{}
	for i in to_replace_ {
		to_replace << i
	}
	mut out := []u8{}
	mut prev := u8(0)
	for u in name {
		if u == 95 { // underscore
			if prev != 95 {
				// only when previous is not _
				out << u
			}
		} else if u > 47 && u < 58 { // see https://www.charset.org/utf-8
			out << u
		} else if u > 96 && u < 123 {
			out << u
		} else if u in to_replace {
			if prev != 95 {
				out << u8(95)
			}
		} else {
			// means previous one should not be used
			// NOTE: prev is deliberately NOT updated for dropped chars, so an
			// underscore run interrupted by a dropped char still collapses
			continue
		}
		prev = u
	}
	name = out.bytestr()
	// name = name.trim(' _') //DONT DO final _ is ok to keep
	if extension.len > 0 {
		name += '.${extension}'
	}
	return name
}

View File

@@ -0,0 +1,8 @@
module texttools
// name_fix_keepext: drops non-[a-z0-9_] chars, collapses '_' runs,
// lowercases, keeps the extension.
fn test_main() {
	assert name_fix_keepext('\$sds__ 4F') == 'sds_4f'
	assert name_fix_keepext('\$sds_?__ 4F') == 'sds_4f'
	assert name_fix_keepext('\$sds_?_!"`{_ 4F') == 'sds_4f'
	assert name_fix_keepext('\$sds_?_!"`{_ 4F.jpg') == 'sds_4f.jpg'
}

View File

@@ -0,0 +1,56 @@
module texttools
import os
// return (sitename,pagename)
// sitename will be empty string if not specified with site:... or site__...
pub fn name_split(name string) !(string, string) {
	mut objname := name.trim(' ')
	objname = objname.trim_left('.')
	if name.contains('__') {
		parts := name.split('__')
		if parts.len != 2 {
			return error('filename not well formatted. Needs to have 2 parts around "__". Now ${name}.')
		}
		// NOTE(review): rebuilt from the untrimmed `name`, discarding the
		// trimming done above — confirm intended
		objname = '${parts[0].trim(' ')}:${parts[1].trim(' ')}'
	}
	// to deal with things like "img/tf_world.jpg ':size=300x160'"
	splitted0 := objname.split(' ')
	if splitted0.len > 0 {
		objname = splitted0[0]
	}
	// NOTE(review): name_fix (via name_fix_keepext) treats ':' as a separator
	// and rewrites it to '_', which would defeat the ':' split below — verify
	objname = name_fix(objname)
	mut sitename := ''
	splitted := objname.split(':')
	if splitted.len == 1 {
		objname = splitted[0]
	} else if splitted.len == 2 {
		sitename = splitted[0]
		objname = splitted[1]
	} else {
		return error("name needs to be in format 'sitename:filename' or 'filename', now '${objname}'")
	}
	objname = objname.trim_left('.')
	if objname.contains('/') {
		// only keep the basename of a path
		objname = os.base(objname)
		if objname.trim(' ') == '' {
			return error('objname empty for os.base')
		}
	}
	// make sure we don't have the e.g. img/ in
	if objname.trim('/ ') == '' {
		return error('objname empty: ${name}')
	}
	if objname.ends_with('/') {
		return error("objname cannot end with /: now '${name}'")
	}
	if objname.trim(' ') == '' {
		return error('objname empty: ${name}')
	}
	// eprintln(" >> namesplit: '$sitename' '$objname'")
	return sitename, objname
}

View File

@@ -0,0 +1,146 @@
# TextTools Module
The TextTools module provides a comprehensive set of utilities for text manipulation and processing in V. It includes functions for cleaning, parsing, formatting, and transforming text in various ways.
## Features
### Array Operations
- `to_array(r string) []string` - Converts a comma or newline separated list to an array of strings
- `to_array_int(r string) []int` - Converts a text list to an array of integers
- `to_map(mapstring string, line string, delimiter_ string) map[string]string` - Intelligent mapping of a line to a map based on a template
### Text Cleaning
- `name_clean(r string) string` - Normalizes names by removing special characters
- `ascii_clean(r string) string` - Removes all non-ASCII characters
- `remove_empty_lines(text string) string` - Removes empty lines from text
- `remove_double_lines(text string) string` - Removes consecutive empty lines
- `remove_empty_js_blocks(text string) string` - Removes empty code blocks (```...```)
### Command Line Parsing
- `cmd_line_args_parser(text string) ![]string` - Parses command line arguments with support for quotes and escaping
- `text_remove_quotes(text string) string` - Removes quoted sections from text
- `check_exists_outside_quotes(text string, items []string) bool` - Checks if items exist in text outside of quotes
### Text Expansion
- `expand(txt_ string, l int, expand_with string) string` - Expands text to a specified length with a given character
### Indentation
- `indent(text string, prefix string) string` - Adds indentation prefix to each line
- `dedent(text string) string` - Removes common leading whitespace from every line
### String Validation
- `is_int(text string) bool` - Checks if text contains only digits
- `is_upper_text(text string) bool` - Checks if text contains only uppercase letters
### Multiline Processing
- `multiline_to_single(text string) !string` - Converts multiline text to a single line with proper escaping
- Handles comments, code blocks, and preserves formatting
### Name/Path Processing
- `name_fix(name string) string` - Normalizes filenames and paths
- `name_fix_keepspace(name string) !string` - Like name_fix but preserves spaces
- `name_fix_no_ext(name_ string) string` - Removes file extension
- `name_fix_snake_to_pascal(name string) string` - Converts snake_case to PascalCase
- `name_fix_pascal_to_snake(name string) string` - Converts PascalCase to snake_case
- `name_split(name string) !(string, string)` - Splits name into site and page components
### Text Splitting
- `split_smart(t string, delimiter_ string) []string` - Intelligent string splitting that respects quotes
### Tokenization
- `tokenize(text_ string) TokenizerResult` - Tokenizes text into meaningful parts
- `text_token_replace(text string, tofind string, replacewith string) !string` - Replaces tokens in text
### Version Parsing
- `version(text_ string) int` - Converts version strings to comparable integers
- Example: "v0.4.36" becomes 4036
- Example: "v1.4.36" becomes 1004036
## Usage Examples
### Array Operations
```v
// Convert comma-separated list to array
text := "item1,item2,item3"
array := texttools.to_array(text)
// Result: ['item1', 'item2', 'item3']
// Smart mapping
r := texttools.to_map("name,-,-,-,-,pid,-,-,-,-,path",
    "root 304 0.0 0.0 408185328 1360 ?? S 16Dec23 0:34.06 /usr/sbin/distnoted", '')
// Result: {'name': 'root', 'pid': '1360', 'path': '/usr/sbin/distnoted'}
```
### Text Cleaning
```v
// Clean name
name := texttools.name_clean("Hello@World!")
// Result: "HelloWorld"
// Remove empty lines
text := texttools.remove_empty_lines("line1\n\nline2\n\n\nline3")
// Result: "line1\nline2\nline3"
```
### Command Line Parsing
```v
// Parse command line with quotes
args := texttools.cmd_line_args_parser("'arg with spaces' --flag=value")
// Result: ['arg with spaces', '--flag=value']
```
### Indentation
```v
// Add indentation
text := texttools.indent("line1\nline2", " ")
// Result: " line1\n line2\n"
// Remove common indentation
text := texttools.dedent(" line1\n line2")
// Result: "line1\nline2"
```
### Name Processing
```v
// Convert to snake case
name := texttools.name_fix_pascal_to_snake("HelloWorld")
// Result: "hello_world"
// Convert to pascal case
name := texttools.name_fix_snake_to_pascal("hello_world")
// Result: "HelloWorld"
```
### Version Parsing
```v
// Parse version string
ver := texttools.version("v0.4.36")
// Result: 4036
ver := texttools.version("v1.4.36")
// Result: 1004036
```
## Error Handling
Many functions in the module return a Result type (indicated by `!` in the function signature). These functions can return errors that should be handled appropriately:
```v
// Example of error handling
name := texttools.name_fix_keepspace("some@name") or {
println("Error: ${err}")
return
}
```
## Best Practices
1. Always use appropriate error handling for functions that return Results
2. Consider using `dedent()` before processing multiline text to ensure consistent formatting
3. When working with filenames, use the appropriate name_fix variant based on your needs
4. For command line parsing, be aware of quote handling and escaping rules
5. When using tokenization, consider the context and whether smart splitting is needed
## Contributing
The TextTools module is part of the heroLib project. Contributions are welcome through pull requests.

View File

@@ -0,0 +1,46 @@
# regex
## basic regex utilities
- .
## regex replacer
Tool to flexibly replace elements in file(s) or text.
The next example replaces several patterns in one pass over a block of text:
```v
import freeflowuniverse.herolib.core.texttools.regext
text := '
this is test_1 SomeTest
this is Test 1 SomeTest
need to replace TF to ThreeFold
need to replace ThreeFold0 to ThreeFold
need to replace ThreeFold1 to ThreeFold
'
text_out := '
this is TTT SomeTest
this is TTT SomeTest
need to replace ThreeFold to ThreeFold
need to replace ThreeFold to ThreeFold
need to replace ThreeFold to ThreeFold
'
mut ri := regext.regex_instructions_new()
ri.add(['TF:ThreeFold0:ThreeFold1:ThreeFold']) or { panic(err) }
ri.add_item('test_1', 'TTT') or { panic(err) }
ri.add_item('^Stest 1', 'TTT') or { panic(err) } //will be case insensitive search
mut text_out2 := ri.replace(text: text, dedent: true) or { panic(err) }
```

View File

@@ -0,0 +1,41 @@
module regext
import regex
// Find all placeholders of the form {NAME} in txt.
// NAME may contain lowercase letters a-z, digits 0-9 and underscores.
// Returns the list of NAMEs found (braces and spaces stripped).
pub fn find_simple_vars(txt string) []string {
	mut matcher := regex.regex_opt(r'\{(\w+)\}') or { panic(err) }
	mut names := []string{}
	for m in matcher.find_all_str(txt) {
		names << m.trim('{} ')
	}
	return names
}
// Strip a leading 'sid:' prefix from c and trim surrounding whitespace;
// strings without the prefix are returned unchanged.
fn remove_sid(c string) string {
	if !c.starts_with('sid:') {
		return c
	}
	return c[4..].trim_space()
}
// find parts of text in form sid:abc till sid:abcde (can be a...z 0...9) .
// return list of the found elements .
// to make all e.g. lowercase do e.g. words = words.map(it.to_lower()) after it
pub fn find_sid(txt string) []string {
	// 3 to 5 alphanumeric chars after 'sid:', followed by one whitespace
	// char or a literal '$' (inside a char class '$' is the character,
	// not end-of-line).
	// NOTE(review): a sid at the very end of the text with no trailing
	// whitespace will therefore not match — confirm this is intended.
	pattern := r'sid:[a-zA-Z0-9]{3,5}[\s$]'
	mut re := regex.regex_opt(pattern) or { panic(err) }
	mut words := re.find_all_str(txt)
	// words = words.map(it.to_lower())
	// each match still carries the 'sid:' prefix and the trailing
	// separator char; remove_sid strips both
	words = words.map(remove_sid(it))
	return words
}

View File

@@ -0,0 +1,47 @@
module regext
// find_sid must return only well-formed sids (3-5 alphanumeric chars
// followed by whitespace): 'rrrrrr' (6 chars), 's' (too short) and
// 's_d' ('_' not allowed) are excluded; duplicates are kept.
fn test_stdtext() {
	// this is test without much fancyness, just rext replace, no regex, all case sensitive
	text := '
!!action.something sid:aa733
sid:aa733
...sid:aa733 ss
...sid:rrrrrr ss
sid:997
sid:s d
sid:s_d
'
	r := find_sid(text)
	assert r == ['aa733', 'aa733', 'aa733', '997']
}
// find_simple_vars must pick up only {word} placeholders; 'sid:...' forms
// and text outside braces are ignored, underscores inside braces are kept.
fn test_find_simple_vars() {
	text := '
!!action.something {sid}
sid:aa733
{a}
...sid:rrrrrr ss {a_sdsdsdsd_e__f_g}
sid:997
sid:s d
sid:s_d
'
	r := find_simple_vars(text)
	assert r == ['sid', 'a', 'a_sdsdsdsd_e__f_g']
}

View File

@@ -0,0 +1,272 @@
module regext
import freeflowuniverse.herolib.core.texttools
import regex
import freeflowuniverse.herolib.ui.console
import os
// ReplaceInstructions holds an ordered list of replace rules which are
// applied one after the other to each line of input text.
pub struct ReplaceInstructions {
pub mut:
	instructions []ReplaceInstruction
}

// ReplaceInstruction is one find/replace rule.
// Either find_str (literal, case-sensitive search) or regex is used:
// an empty find_str means the compiled regex drives the matching.
pub struct ReplaceInstruction {
pub:
	regex_str    string // source text the regex was compiled from (informational)
	find_str     string // literal search string; '' selects regex mode
	replace_with string // replacement text
pub mut:
	regex regex.RE // compiled regex, only meaningful when find_str == ''
}
// get_regex_queries returns the query string of every compiled regex
// held in the instruction list.
fn (mut self ReplaceInstructions) get_regex_queries() []string {
	return self.instructions.map(it.regex.get_query())
}
// Rewrite a plain filter string into a regex string.
// Each ascii letter matches both its lower and upper case form,
// digits are kept as-is, any of '_- ' becomes ' *' (zero or more spaces),
// quotes are dropped, and '^&![]' are rejected as illegal.
// Characters outside these sets are silently skipped.
pub fn regex_rewrite(r string) !string {
	lowered := r.to_lower()
	mut parts := []string{}
	for b in lowered {
		ch := b.ascii_str()
		if 'abcdefghijklmnopqrstuvwxyz'.contains(ch) {
			parts << '[' + ch + ch.to_upper() + ']'
		} else if '0123456789'.contains(ch) {
			parts << ch
		} else if '_- '.contains(ch) {
			parts << r' *'
		} else if '\'"'.contains(ch) {
			continue
		} else if '^&![]'.contains(ch) {
			return error('cannot rewrite regex: ${r}, found illegal char ^&![]')
		}
	}
	return parts.join('')
}
// add_item registers a single search statement.
// regex_find_str forms:
//   '^R...' : regex, see https://github.com/vlang/v/blob/master/vlib/regex/README.md
//   '^S...' : case-insensitive text search (rewritten to a regex internally)
//   otherwise: literal, case-sensitive find string
// replace_with is the substitution text.
// Returns an error when a regex fails to compile (previously this panicked,
// which is wrong for a function that already returns a result).
fn (mut self ReplaceInstructions) add_item(regex_find_str string, replace_with string) ! {
	mut item := regex_find_str
	if item.starts_with('^R') {
		item = item[2..] // strip the ^R marker
		r := regex.regex_opt(item) or { return error('cannot compile regex "${item}": ${err}') }
		self.instructions << ReplaceInstruction{
			regex_str: item
			regex: r
			replace_with: replace_with
		}
	} else if item.starts_with('^S') {
		item = item[2..] // strip the ^S marker
		item2 := regex_rewrite(item)!
		r := regex.regex_opt(item2) or { return error('cannot compile regex "${item2}": ${err}') }
		self.instructions << ReplaceInstruction{
			regex_str: item
			regex: r
			replace_with: replace_with
		}
	} else {
		// literal mode: find_str set, no regex compiled
		self.instructions << ReplaceInstruction{
			replace_with: replace_with
			find_str: item
		}
	}
}
// add registers a list of replace instructions.
// Each list element is one or more search statements followed by the
// replacement, all separated by ':'. A search statement can be:
// - a regex, starting with ^R (see vlib regex README) .
// - a case-insensitive string find, starting with ^S (converted to regex) .
// - any other string: a literal, case-sensitive find .
// input is ["^Rregex:replacewith",...] .
// input is ["^Rregex:^Rregex2:replacewith"] .
// input is ["findstr:findstr:replacewith"] .
// input is ["findstr:^Rregex2:replacewith"] .
pub fn (mut ri ReplaceInstructions) add(replacelist []string) ! {
	for i in replacelist {
		splitted := i.split(':')
		// validate before touching splitted[len-1]: the instruction needs
		// at least one search part plus the replacement
		if splitted.len < 2 {
			return error("Cannot add ${i} because needs to have 2 parts, wrong syntax, to regex instructions:\n\"${replacelist}\"")
		}
		// last part is the replacement; every earlier part is a search
		replace_with := splitted[splitted.len - 1]
		for item in splitted[0..(splitted.len - 1)] {
			ri.add_item(item, replace_with)!
		}
	}
}
// add_from_text parses one replace instruction per line of txt.
// Each line uses the same syntax as add():
// - regex starting with ^R (see vlib regex README) .
// - case-insensitive find starting with ^S (converted to regex) .
// - any other string: a literal, case-sensitive find .
// example input
// '''
// ^Rregex:replacewith
// ^Rregex:^Rregex2:replacewith
// ^Sfindstr:replacewith
// findstr:findstr:replacewith
// findstr:^Rregex2:replacewith
// '''
// Blank lines and lines without ':' are ignored.
pub fn (mut ri ReplaceInstructions) add_from_text(txt string) ! {
	mut rules := []string{}
	for line in txt.split_into_lines() {
		if line.trim_space() == '' || !line.contains(':') {
			continue
		}
		rules << line
	}
	ri.add(rules)!
}
@[params]
pub struct ReplaceArgs {
pub mut:
	text   string // input text to process
	dedent bool   // when true, texttools.dedent is applied to text first
}
// replace applies all registered instructions to args.text and returns the
// result. Matching happens line per line; when args.dedent is set the text
// is dedented first.
pub fn (mut self ReplaceInstructions) replace(args ReplaceArgs) !string {
	mut text2 := args.text
	if args.dedent {
		text2 = texttools.dedent(text2)
	}
	if text2.len == 0 {
		return ''
	}
	// split_into_lines drops a trailing '\n'; remember whether there was
	// one so the result is not trimmed below in that case
	endline := text2.ends_with('\n')
	mut res := []string{}
	for line in text2.split_into_lines() {
		mut line2 := line
		for mut i in self.instructions {
			if i.find_str == '' {
				// regex mode
				// (a dead find_all() scan and unused counter were removed here)
				line2 = i.regex.replace(line2, i.replace_with)
			} else {
				// literal, case-sensitive mode
				line2 = line2.replace(i.find_str, i.replace_with)
			}
		}
		res << line2
	}
	mut x := res.join('\n')
	if !endline {
		x = x.trim_right('\n')
	}
	return x
}
@[params]
pub struct ReplaceDirArgs {
pub mut:
	path       string   // root directory to walk
	extensions []string // file extensions to process ('' dot and case are normalized); empty = all files
	dryrun     bool     // when true, report matches but do not write files
}
// replace_in_dir walks args.path recursively and applies the instructions
// to every file whose extension matches args.extensions (empty list = all).
// If args.dryrun is true nothing is written, matches are only reported.
// Returns the number of files that changed (or would change in dryrun).
pub fn (mut self ReplaceInstructions) replace_in_dir(args ReplaceDirArgs) !int {
	// build a unique list of normalized extensions: lowercase, no leading dot.
	// normalize BEFORE the dedup check — previously '.V' and 'v' both passed
	// the raw-string check and produced duplicates in the list
	mut extensions := []string{}
	for ext in args.extensions {
		mut ext2 := ext.to_lower()
		if ext2.starts_with('.') {
			ext2 = ext2[1..]
		}
		if ext2 !in extensions {
			extensions << ext2
		}
	}
	mut done := []string{}
	return self.replace_in_dir_recursive(args.path, extensions, args.dryrun, mut done)!
}
// replace_in_dir_recursive walks path1 and applies the instructions to every
// matching file. Hidden ('.') and underscore-prefixed directories are skipped.
// Returns how many files changed, including files in subdirectories.
fn (mut self ReplaceInstructions) replace_in_dir_recursive(path1 string, extensions []string, dryrun bool, mut done []string) !int {
	items := os.ls(path1) or {
		return error('cannot load folder for replace because cannot find ${path1}')
	}
	mut count := 0
	for item in items {
		pathnew := os.join_path(path1, item)
		if os.is_dir(pathnew) {
			if item.starts_with('.') {
				continue
			}
			if item.starts_with('_') {
				continue
			}
			// BUGFIX: the recursive result was previously discarded, so
			// changes in subdirectories were never counted
			count += self.replace_in_dir_recursive(pathnew, extensions, dryrun, mut done)!
		} else {
			// BUGFIX: os.file_ext() returns '' for extensionless files and
			// the old '[1..]' slice panicked on them; trim the dot safely
			ext := os.file_ext(pathnew).trim_string_left('.').to_lower()
			if extensions == [] || ext in extensions {
				// file matches: compare old and new content
				txtold := os.read_file(pathnew)!
				txtnew := self.replace(text: txtold, dedent: false)!
				if txtnew.trim(' \n') == txtold.trim(' \n') {
					console.print_header(' nothing to do : ${pathnew}')
				} else {
					console.print_header(' replace done : ${pathnew}')
					count++
					if !dryrun {
						// now write the file back
						os.write_file(pathnew, txtnew)!
					}
				}
			}
		}
	}
	return count
}
// regex_instructions_new returns an empty ReplaceInstructions set;
// fill it with add(), add_item() or add_from_text().
pub fn regex_instructions_new() ReplaceInstructions {
	return ReplaceInstructions{}
}

View File

@@ -0,0 +1,115 @@
module regext
import os
import freeflowuniverse.herolib.core.texttools { dedent }
// Literal (non-regex, case-sensitive) replacements: the three searches in
// one instruction plus two add_item() literals must all map to their
// replacement; comparison is done after dedent + newline trim.
fn test_stdtext() {
	// this is test without much fancyness, just rext replace, no regex, all case sensitive
	text := '
this is test_1 SomeTest
this is test 1 SomeTest
need to replace TF to ThreeFold
need to replace ThreeFold0 to ThreeFold
need to replace ThreeFold1 to ThreeFold
'
	text_out := '
this is TTT SomeTest
this is TTT SomeTest
need to replace ThreeFold to ThreeFold
need to replace ThreeFold to ThreeFold
need to replace ThreeFold to ThreeFold
'
	mut ri := regex_instructions_new()
	ri.add(['TF:ThreeFold0:ThreeFold1:ThreeFold']) or { panic(err) }
	ri.add_item('test_1', 'TTT') or { panic(err) }
	ri.add_item('test 1', 'TTT') or { panic(err) }
	mut text_out2 := ri.replace(text: text, dedent: true) or { panic(err) }
	assert dedent(text_out2).trim('\n') == dedent(text_out).trim('\n')
}
// Dry-run directory replace over the testdata folder next to this file:
// both fixture files contain a key that gets replaced, so count must be 2.
fn test_dirreplace() {
	path := os.real_path(os.join_path(os.dir(@FILE), 'testdata'))
	mut ri := regex_instructions_new()
	ri.add(['key_bob:KEY_BOB', 'key_alice:KEY_ALICE']) or { panic(err) }
	count := ri.replace_in_dir(path: path, extensions: ['v'], dryrun: true) or { panic(err) }
	assert count == 2
}
// fn test_regex1() {
// text := '
// this is test_1 SomeTest
// this is test 1 SomeTest
// need to replace TF to ThreeFold
// need to replace ThreeFold0 to ThreeFold
// need to replace ThreeFold1 to ThreeFold
// '
// text_out := '
// this is TTT SomeTest
// this is TTT SomeTest
// need to replace ThreeFold to ThreeFold
// need to replace ThreeFold to ThreeFold
// need to replace ThreeFold to ThreeFold
// '
// mut ri := regex_instructions_new(['tf:threefold0:^R ThreeFold1:ThreeFold']) or {
// panic(err)
// }
// ri.add('^Rtest[ _]1', 'TTT') or { panic(err) }
// mut text_out2 := ri.replace(text) or { panic(err) }
// assert dedent(text_out2).trim('\n') == dedent(text_out).trim('\n')
// // panic('s')
// }
// fn test_regex2() {
// text := '
// this is test_1 SomeTest
// this is test 1 SomeTest
// need to replace ThreeFold 0 to ThreeFold
// need to replace ThreeFold0 to ThreeFold
// no need to replace ThreeFold1; to ThreeFold
// '
// text_out := '
// '
// mut ri := regex_instructions_new(['^Sthreefold 0:bluelagoon']) or {
// panic(err)
// }
// mut text_out2 := ri.replace(text) or { panic(err) }
// assert dedent(text_out2).trim('\n') == dedent(text_out).trim('\n')
// // panic('s')
// }

View File

@@ -0,0 +1,3 @@
// testdata fixture: must contain the literal 'key_bob' for the
// replace_in_dir test. Fixed invalid V (assignment to an undeclared
// variable) by declaring with ':='.
fn testfunction1() {
	key_bob := 'bobs key'
}

View File

@@ -0,0 +1,3 @@
// testdata fixture: must contain the literal 'key_alice' for the
// replace_in_dir test.
fn testfunction2() {
	key_alice := 'mock key for regex_test'
}

View File

@@ -0,0 +1,51 @@
module texttools
// SplitState tracks the split_smart scanner:
// .start  = outside a quoted section
// .string = inside a quoted section (between `, ' or " characters)
enum SplitState {
	start
	string
}
// Split a string on delimiter characters while keeping quoted sections
// (delimited by `, ' or ") together as single items.
// delimiter_ lists the separator characters; '' defaults to ',| '.
// ```
// r0:=texttools.split_smart("'root' 304 0.0 0.0 408185328 1360 ?? S 16Dec23 0:34.06 /usr/sbin/distnoted\n \n")
// assert ['root', '304', '0.0', '0.0', '408185328', '1360', '??', 'S', '16Dec23', '0:34.06', '/usr/sbin/distnoted']==r0
// ```
pub fn split_smart(t string, delimiter_ string) []string {
	delimiter := if delimiter_.len == 0 { ',| ' } else { delimiter_ }
	mut state := SplitState.start
	mut buf := []string{}
	mut parts := []string{}
	for c in t.trim_space().split('') {
		is_quote := '`\'"'.contains(c)
		match state {
			.start {
				if is_quote {
					// opening quote: collect until the closing quote
					state = .string
				} else if delimiter.contains(c) {
					// separator: flush the pending token, if any
					if buf.len > 0 {
						parts << buf.join('').trim_space()
					}
					buf = []string{}
				} else {
					buf << c
				}
			}
			.string {
				if is_quote {
					// closing quote: emit the quoted token (even if empty)
					state = .start
					parts << buf.join('').trim_space()
					buf = []string{}
				} else {
					buf << c
				}
			}
		}
	}
	if buf.len > 0 {
		parts << buf.join('').trim_space()
	}
	return parts
}

View File

@@ -0,0 +1,13 @@
module texttools
// Normalize template escape markers:
// '^^'  -> '@'
// '???' -> '$('  (must run before the '??' rule)
// '??'  -> '$'
// '\t'  -> spaces
pub fn template_replace(template_ string) string {
	return template_
		.replace('^^', '@')
		.replace('???', '$(')
		.replace('??', '$')
		.replace('\t', '    ')
}

182
lib/core/texttools/tokens.v Normal file
View File

@@ -0,0 +1,182 @@
module texttools
// import regex
// TokenizerResult holds the word tokens found by tokenize().
pub struct TokenizerResult {
pub mut:
	items []TokenizerItem
}

// TokenizerItem is one found word token.
pub struct TokenizerItem {
pub mut:
	// the original spelling as found in the text (used for replacement)
	toreplace string
	// is the most fixed string: normalized form used for matching
	// (lowercased, separators collapsed, underscores removed)
	matchstring string
}
// text_token_replace tokenizes text and replaces every occurrence whose
// normalized token form matches tofind with replacewith.
pub fn text_token_replace(text string, tofind string, replacewith string) !string {
	mut tr := tokenize(text)
	return tr.replace(text, tofind, replacewith)!
}
// replace substitutes every tokenized occurrence matching tofind.
// tofind is normalized (lowercased, separators and underscores removed)
// and compared against each item's matchstring; the item's original
// spelling (toreplace) is what gets substituted in text.
// NOTE: this is a plain substring replace — it also hits occurrences
// inside larger words; whole-word replacement was attempted earlier and
// removed because it did not work.
pub fn (mut tr TokenizerResult) replace(text string, tofind string, replacewith string) !string {
	tofind2 := name_fix_no_underscore_token(tofind)
	mut text2 := text
	for item in tr.items {
		if item.matchstring == tofind2 {
			text2 = text2.replace(item.toreplace, replacewith)
		}
	}
	return text2
}
// name_fix_no_underscore_token normalizes name like name_fix_token and
// additionally strips every underscore.
pub fn name_fix_no_underscore_token(name string) string {
	return name_fix_token(name).replace('_', '')
}
// Flat find/replace pairs for string.replace_each: collapse separators
// (space, '-', '::', ';', ':', '.') into '_'.
// needs to be 2x because can be 3 to 2 to 1
// NOTE(review): the duplicated '__' -> '_' pair is intentional per the
// comment above — it collapses up to three underscores to one; verify
// against replace_each semantics before removing.
const name_fix_replaces = [
	' ',
	'_',
	'-',
	'_',
	'__',
	'_',
	'__',
	'_',
	'::',
	'_',
	';',
	'_',
	':',
	'_',
	'.',
	'_',
]
// name_fix_token lowercases name, collapses separator characters into
// underscores (see name_fix_replaces) and trims leading/trailing ' ._'.
pub fn name_fix_token(name string) string {
	return name.to_lower().replace_each(name_fix_replaces).trim(' ._')
}
// word_skip reports whether text is a common filler word (case-insensitive)
// that the tokenizer should not emit as a token.
fn word_skip(text string) bool {
	return text.to_lower() in ['the', 'some', 'and', 'plus', 'will', 'do', 'are', 'these']
}
// tokenize scans text_ and collects unique word tokens (letters, digits,
// '_' and '-'), skipping:
// - lines starting with '!' or 'http'
// - everything between ''' / ``` / """ fence lines
// - content inside brackets ( [ {
// - http... words (treated as links until the next space)
// - filler words (see word_skip) and 1-char words
// Each token keeps its original spelling (toreplace) plus its normalized
// form (matchstring).
pub fn tokenize(text_ string) TokenizerResult {
	text := dedent(text_)
	mut skip := false // true while inside ([{ ... )]} brackets
	mut skipline := false // true while inside a code fence
	mut prev := '' // previous char, used to detect word boundaries
	mut word := '' // word currently being collected
	mut islink := false // true while consuming an http... link
	mut tr := TokenizerResult{}
	mut done := []string{} // original spellings already emitted (dedup)
	lines := text.split('\n')
	//
	for original_line in lines {
		line := original_line.trim(' ')
		if line.starts_with('!') {
			continue
		}
		if line.starts_with('http') {
			continue
		}
		// a fence marker toggles skipping; the marker line itself is skipped too
		if line.contains("'''") || line.contains('```') || line.contains('"""') {
			skipline = !skipline
		}
		if skipline {
			continue
		}
		// per-line scanner state reset (islink intentionally carries over?
		// NOTE(review): islink is not reset per line — confirm intended)
		prev = ''
		word = ''
		skip = false
		splitted_line := line.split('')
		for ch in splitted_line {
			if '[({'.contains(ch) {
				skip = true
				continue
			}
			if skip {
				// inside brackets: only look for the closing bracket
				if ')]}'.contains(ch) {
					skip = false
					prev = ''
					continue
				}
			} else {
				if islink {
					// consume link chars until the next space
					if ch == ' ' {
						islink = false
					} else {
						continue
					}
				}
				if 'abcdefghijklmnopqrstuvwxyz0123456789_-'.contains(ch.to_lower()) {
					// extend the word only when starting at a word boundary
					if word.len > 0 || prev == '' || '\t\n ,:;.?!#|'.contains(prev) {
						word += ch
					}
					if word.starts_with('http') {
						islink = true
					}
				} else if '\t\n ,:;.?!#|'.contains(ch) {
					// only when end is newline tab or whitespace or ...
					if word.len > 1 && !word_skip(word) && word !in done {
						word_with_no_underscores := name_fix_no_underscore_token(word)
						tr.items << TokenizerItem{
							toreplace: word
							matchstring: word_with_no_underscores.clone()
						}
						done << word
					}
					word = ''
					prev = ''
					continue
				} else {
					// any other char invalidates the current word
					word = ''
				}
				prev = ch
			}
		}
		// flush the word still pending at end of line
		if word.len > 1 && !word_skip(word) && word !in done {
			word_with_no_underscores := name_fix_no_underscore_token(word)
			tr.items << TokenizerItem{
				toreplace: word
				matchstring: word_with_no_underscores.clone()
			}
			done << word
		}
	}
	return tr
}

View File

@@ -0,0 +1,111 @@
module texttools
// tokenize() must skip filler words ('these', 'Are', 'Some'), dedupe by
// original spelling, and normalize matchstring (lowercase, underscores
// removed); '&redlagoon' is dropped because '&' invalidates the word start.
fn test_tokens() {
	mut text := '
these; Are Some ramdom words!
blue lagoon
Blue lagoon
blue_lagoon
blue_Lagoon
lagoon
blueLagoon
&redlagoon
'
	r := tokenize(text)
	r2 := TokenizerResult{
		items: [TokenizerItem{
			toreplace: 'ramdom'
			matchstring: 'ramdom'
		}, TokenizerItem{
			toreplace: 'words'
			matchstring: 'words'
		}, TokenizerItem{
			toreplace: 'blue'
			matchstring: 'blue'
		}, TokenizerItem{
			toreplace: 'lagoon'
			matchstring: 'lagoon'
		}, TokenizerItem{
			toreplace: 'Blue'
			matchstring: 'blue'
		}, TokenizerItem{
			toreplace: 'blue_lagoon'
			matchstring: 'bluelagoon'
		}, TokenizerItem{
			toreplace: 'blue_Lagoon'
			matchstring: 'bluelagoon'
		}, TokenizerItem{
			toreplace: 'blueLagoon'
			matchstring: 'bluelagoon'
		}]
	}
	assert r == r2
}
// fn test_tokens2() {
// mut text := '
// these; Are Some ramdom words!
// blue lagoon
// Blue lagoon
// red_dragon
// reddragon
// blue_lagoon
// blue_Lagoon
// lagoon
// ;bluelagoon
// '
// mut ri := regex_instructions_new()
// ri.add(['bluelagoon:red_dragon:ThreeFold']) or { panic(err) }
// mut text_out2 := ri.replace(text:text) or { panic(err) }
// compare := '
// these; Are Some ramdom words!
// blue lagoon
// Blue lagoon
// ThreeFold
// ThreeFold
// ThreeFold
// ThreeFold
// lagoon
// ;ThreeFold
// '
// a := dedent(text_out2).trim(' \n')
// b := dedent(compare).trim(' \n')
// assert a == b
// }
// tokenize() must ignore bracketed content ([...] and (...)/{...}) and
// http(s) links; only the free-standing word 'test' survives.
fn test_tokens3() {
	mut text := r'
- [Definitions](tftech:definitions)
(koekoe)
(great )
{great }
- [Disclaimer](disclaimer)
- [farmer_terms_conditions](terms_conditions_farmer)
- [terms_conditions_websites](terms_conditions_websites) test
- [terms_conditions_griduser](terms_conditions_griduser)
- [privacypolicy](privacypolicy)
http://localhost:9998/threefold/#/farming_certification
https://greencloud
'
	r := tokenize(text)
	assert r == TokenizerResult{
		items: [TokenizerItem{
			toreplace: 'test'
			matchstring: 'test'
		}]
	}
}

View File

@@ -0,0 +1,21 @@
module texttools
import math
// Convert a version string to a comparable integer, each dotted part
// taking three decimal digits:
// v0.4.36 becomes 4036 .
// v1.4.36 becomes 1004036
// 'v' characters and whitespace around parts are ignored; empty parts
// are dropped.
pub fn version(text_ string) int {
	cleaned := text_.to_lower().replace('v', '')
	parts := cleaned.split('.').filter(it.trim_space() != '').reverse().map(it.trim_space().int())
	mut nr := 0
	for level, part in parts {
		// least-significant part first: weight grows by 1000 per level
		nr += part * int(math.powi(1000, level))
	}
	return nr
}

View File

@@ -0,0 +1,15 @@
module texttools
// version() must ignore whitespace (even newlines) around the dotted
// parts and weight each part by a factor of 1000.
fn test_version() {
	assert version(' v0. 0.36 ') == 36
	assert version(' v0.36 ') == 36
	assert version(' 36 ') == 36
	assert version(' v0. 4.36 ') == 4036
	assert version(' v2. 4.36 ') == 2004036
	assert version(' 0.18.0 ') == 18000
	assert version('
	v2. 4.36
	') == 2004036
}