def read_detoken_rules(lang):
    """Load the detokenization rules for a language.

    Reads the file named 'detokenize' inside the ``lang`` sub-directory of
    DATA_DIR and returns one entry per line, in file order, with leading and
    trailing whitespace stripped. Blank lines are kept as empty strings.

    lang: Name of the language sub-directory under DATA_DIR.
    """
    rules_path = path.join(DATA_DIR, lang, 'detokenize')
    # NOTE(review): utf8open is defined elsewhere; presumably it opens the
    # file for reading as UTF-8 text -- confirm against its definition.
    with utf8open(rules_path) as rules_file:
        return [raw_line.strip() for raw_line in rules_file]