mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
* Add function to read detokenization rules
This commit is contained in:
parent
077885637d
commit
43d5964e13
|
@ -70,6 +70,15 @@ def read_tokenization(lang):
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
|
def read_detoken_rules(lang):
    """Load the lines of the 'detokenize' data file for a language.

    The file is looked up at ``DATA_DIR/<lang>/detokenize`` and opened with
    ``utf8open``. Every line is kept, in file order, with its leading and
    trailing whitespace stripped.

    Args:
        lang: Name of the language sub-directory under ``DATA_DIR``.

    Returns:
        A list with one stripped string per line of the file.
    """
    rules_path = path.join(DATA_DIR, lang, 'detokenize')
    with utf8open(rules_path) as rules_file:
        # One entry per line; strip() also drops the trailing newline.
        return [raw_line.strip() for raw_line in rules_file]
|
||||||
|
|
||||||
|
|
||||||
def align_tokens(ref, indices):
|
def align_tokens(ref, indices):
|
||||||
start = 0
|
start = 0
|
||||||
queue = list(indices)
|
queue = list(indices)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user