* Add detokenize method and test

2025-11-09 12:27:54 +03:00 · 2014-10-18 18:02:05 +11:00 · 2014-10-18 18:02:05 +11:00 · 12742f4f83
commit 12742f4f83
parent df110476d5
2 changed files with 43 additions and 0 deletions
--- a/spacy/util.py
+++ b/spacy/util.py
@ -81,3 +81,25 @@ def align_tokens(ref, indices):
        yield token, emit
        start = end
    assert not queue
+
+
+def detokenize(token_rules, words):
+    """Group token indices into chunks that are whitespace-delimited in text.
+
+    A rule marks with '<SEP>' where tokens are glued together in raw text;
+    rules are applied in order as plain substring replacements, e.g.
+
+    >>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])
+    [(0,), (1, 2, 3)]
+    """
+    joined = ' '.join(words)
+    for rule in token_rules:
+        # Naive on purpose: swap the spaced-out form for the glued form.
+        spaced = rule.replace('<SEP>', ' ')
+        joined = joined.replace(spaced, rule)
+    chunks, start = [], 0
+    for piece in joined.split():
+        # Each '<SEP>' left inside a piece stands for one extra token.
+        end = start + piece.count('<SEP>') + 1
+        chunks.append(tuple(range(start, end)))
+        start = end
+    return chunks
--- a/tests/test_detokenize.py
+++ b/tests/test_detokenize.py
@ -0,0 +1,21 @@
+from spacy.util import detokenize
+
+def test_punct():
+    # A comma or full stop is glued onto whichever token precedes it.
+    rules = ('<SEP>,', '<SEP>.')
+    words = 'Pierre Vinken , 61 years old .'.split()
+    assert detokenize(rules, words) == [(0,), (1, 2), (3,), (4,), (5, 6)]
+
+
+def test_contractions():
+    # "ca" and "n't" are covered by one rule, so they share a chunk.
+    rules = ("ca<SEP>n't",)
+    words = "I ca n't even".split()
+    assert detokenize(rules, words) == [(0,), (1, 2), (3,)]
+
+
+def test_contractions_punct():
+    # The contraction rule and the "!" rule chain into a single chunk.
+    rules = ("ca<SEP>n't", '<SEP>!')
+    words = "I ca n't !".split()
+    assert detokenize(rules, words) == [(0,), (1, 2, 3)]