mirror of https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00

commit 4e79446dc2
parent 9bef797afe

    Reading in tokenization rules correctly. Passing tests.
 1395  spacy/en.cpp   (file diff suppressed because it is too large)
   27  spacy/en.pyx

spacy/en.pyx

@@ -10,6 +10,7 @@ from libc.stdint cimport uint64_t
 from spacy.lexeme cimport Lexeme
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B
+from . import util


 STRINGS = {}
@@ -20,6 +21,23 @@ LEXEMES.set_empty_key(0)
 cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)


+def load_tokenization(token_rules):
+    cdef Lexeme* word
+    cdef StringHash hashed
+    for chunk, lex, tokens in token_rules:
+        hashed = hash_string(chunk, len(chunk))
+        assert LEXEMES[hashed] == NULL
+        word = _add(hashed, lex, len(lex), len(lex))
+        for i, lex in enumerate(tokens):
+            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
+            length = len(token_string)
+            hashed = hash_string(token_string, length)
+            word.tail = _add(hashed, lex, 0, len(lex))
+            word = word.tail
+
+
+load_tokenization(util.read_tokenization('en'))
+
 cpdef Lexeme_addr lookup(unicode string) except 0:
     '''.. function:: enumerate(sequence[, start=0])
     Fetch a Lexeme representing a word string. If the word has not been seen,
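
For reference, a hedged sketch of the key scheme the new load_tokenization hook uses, assuming a rule entry of the form ("don't", "do", ["n't"]) (a hypothetical example; the data file itself is not shown in this diff). The chunk string is hashed as the key for the head lexeme, and every following token is keyed by a synthetic string built from the chunk, the token's position, and the token itself, so the tail entries cannot collide with ordinary words:

    # Plain-Python sketch of the key scheme above; the example rule is hypothetical.
    chunk, lex, tokens = "don't", "do", ["n't"]
    head_key = chunk                   # hashed with hash_string(chunk, len(chunk))
    tail_keys = ['%s:@:%d:@:%s' % (chunk, i, t) for i, t in enumerate(tokens)]
    print(head_key)    # don't               -> head lexeme stores lex "do"
    print(tail_keys)   # ["don't:@:0:@:n't"] -> chained off the head via word.tail
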
@@ -156,8 +174,8 @@ cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
 cdef size_t _find_split(unicode word, size_t length):
     cdef int i = 0
     # Contractions
-    if word == "'s":
-        return 2
+    if word.endswith("'s"):
+        return length - 2
     # Leading punctuation
     if is_punct(word, 0, length):
         return 1
@@ -166,11 +184,8 @@ cdef size_t _find_split(unicode word, size_t length):
     i = length - 1
     while i >= 2 and is_punct(word, i-1, length):
         i -= 1
-    else:
-        # Doesn't start or end with the punct
-        while i < length and not is_punct(word, i, length):
-            i += 1
     return i

+
 cdef bint is_punct(unicode word, size_t i, size_t length):
     return not word[i].isalnum()
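
The two hunks above make _find_split strip a trailing "'s" from any token rather than only handling the bare "'s" string, and drop the old while/else fallback that scanned forward for the first punctuation character. A plain-Python approximation of the revised heuristic (the Cython types and the module-level is_punct are simplified; this is an illustration, not the compiled code):

    def _is_punct(word, i):
        # same test as the Cython is_punct helper: non-alphanumeric counts as punctuation
        return not word[i].isalnum()

    def _find_split(word):
        length = len(word)
        if word.endswith("'s"):            # contractions: "it's" -> "it" + "'s"
            return length - 2
        if _is_punct(word, 0):             # leading punctuation: peel one character
            return 1
        i = length - 1                     # otherwise walk back over trailing punctuation
        while i >= 2 and _is_punct(word, i - 1):
            i -= 1
        return i

    print(_find_split("it's"))     # 2 -> "it" + "'s"
    print(_find_split("Hello,"))   # 5 -> "Hello" + ","
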

(generated C++ source; filename not captured on this page)

@@ -1,4 +1,4 @@
-/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */
+/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */

 #define PY_SSIZE_T_CLEAN
 #ifndef CYTHON_USE_PYLONG_INTERNALS

spacy/util.py

@@ -1,3 +1,10 @@
+import os
+from os import path
+import codecs
+
+DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
+
+
 def utf8open(loc, mode='r'):
     return codecs.open(loc, mode, 'utf8')

@@ -12,23 +19,23 @@ def load_case_stats(data_dir):
     return case_stats


-def load_clitics(data_dir):
-    clitics_loc = path.join(data_dir, 'clitics.txt')
+def read_tokenization(lang):
+    loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
     seen = set()
-    with utf8open(clitics_loc) as clitics_file:
-        for line in clitics_file:
+    with utf8open(loc) as file_:
+        for line in file_:
             line = line.strip()
             if line.startswith('#'):
                 continue
             if not line:
                 continue
-            clitics = line.split()
-            word = clitics.pop(0)
-            norm_form = clitics.pop(0)
-            assert word not in seen, word
-            seen.add(word)
-            entries.append((word, norm_form, clitics))
+            pieces = line.split()
+            chunk = pieces.pop(0)
+            lex = pieces.pop(0)
+            assert chunk not in seen, chunk
+            seen.add(chunk)
+            entries.append((chunk, lex, pieces))
     return entries

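
read_tokenization replaces the old clitics loader and reads data/<lang>/tokenization instead. The file's contents are not part of this commit page, but the parser implies whitespace-separated columns: the chunk as written, its lex form, then the remaining tokens, with '#'-prefixed comments and blank lines skipped. A hypothetical two-rule file and the entries it would yield:

    # hypothetical contents of data/en/tokenization (format inferred from the parser; not from this commit)
    # chunk    lex    trailing tokens
    don't      do     n't
    it's       it     's

    >>> read_tokenization('en')
    [("don't", 'do', ["n't"]), ("it's", 'it', ["'s"])]
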

(test file; filename not captured on this page)

@@ -28,3 +28,10 @@ def test_case_neq():
 def test_punct_neq():
     addr = lookup('Hello')
     assert lookup('Hello,') != addr
+
+
+def test_short():
+    addr = lookup('I')
+    assert unhash(lex_of(addr)) == 'I'
+    addr = lookup('not')
+    assert unhash(lex_of(addr)) == 'not'
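
Assuming the hypothetical "don't do n't" rule from the sketch above actually appears in data/en/tokenization, the new pieces are meant to fit together roughly like this (the exact behaviour of lookup, lex_of and unhash on special-cased chunks is inferred, not shown in this diff):

    # assumed import path; the test file's own imports are not shown on this page
    from spacy.en import lookup, unhash, lex_of

    addr = lookup("don't")                 # head lexeme pre-loaded by load_tokenization
    assert unhash(lex_of(addr)) == 'do'    # its lex is the first token of the rule
    # the trailing "n't" lexeme is reachable through the head's .tail pointer
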