From 01469b088825ed151fd1c828817887e1959e1ee0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 18 Aug 2014 19:14:00 +0200
Subject: [PATCH] * Refactor spacy so that chunks return arrays of lexemes, so
 that there is properly one lexeme per word.

---
 setup.py                     |  21 ++---
 spacy/__init__.py            |  14 ++--
 spacy/en.pxd                 |   2 +-
 spacy/en.pyx                 |  10 ++-
 spacy/en_ptb.pxd             |   2 +-
 spacy/en_ptb.pyx             |   5 +-
 spacy/lexeme.pxd             |   4 +-
 spacy/lexeme.pyx             |  16 +---
 spacy/spacy.pxd              |  21 +++--
 spacy/spacy.pyx              | 152 ++++++++++++-----------------------
 spacy/tokens.pxd             |   1 +
 spacy/util.py                |   7 +-
 tests/test_contractions.py   |  19 +++--
 tests/test_group_by.py       |   2 +-
 tests/test_orth.py           |   2 +-
 tests/test_post_punct.py     |  11 +--
 tests/test_pre_punct.py      |  16 ++--
 tests/test_rules.py          |   6 +-
 tests/test_surround_punct.py |   9 +--
 tests/test_tokenizer.py      |  15 ++--
 20 files changed, 123 insertions(+), 212 deletions(-)

diff --git a/setup.py b/setup.py
index 1b0093808..4abaf4ae4 100644
--- a/setup.py
+++ b/setup.py
@@ -39,29 +39,20 @@ cython_includes = ['.']
 if 'VIRTUAL_ENV' in os.environ:
     includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'include', 'site', '*'))
-    cython_includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'lib', '*'))
 else:
     # If you're not using virtualenv, set your include dir here.
     pass

 exts = [
+    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++",
-              include_dirs=includes, cython_include_dirs=cython_includes),
-    Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.chartree", ["spacy/chartree.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
+              include_dirs=includes),
+    Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
-              include_dirs=includes, cython_include_dirs=cython_includes),
+              include_dirs=includes),
 ]

diff --git a/spacy/__init__.py b/spacy/__init__.py
index 9f7c7932c..16d71aec6 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,16 +1,14 @@
 from .lexeme import lex_of
-from .lexeme import sic_of
 from .lexeme import length_of

 from .tokens import Tokens

 # Don't know how to get the enum Python visible :(
-SIC = 0
-LEX = 1
-NORM = 2
-SHAPE = 3
-LAST3 = 4
-LENGTH = 5
+LEX = 0
+NORM = 1
+SHAPE = 2
+LAST3 = 3
+LENGTH = 4

-__all__ = [Tokens, lex_of, sic_of, length_of, SIC, LEX, NORM, SHAPE, LAST3, LENGTH]
+__all__ = [Tokens, lex_of, length_of, LEX, NORM, SHAPE, LAST3, LENGTH]

diff --git a/spacy/en.pxd b/spacy/en.pxd
index ee58118a9..9f0edb791 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -9,7 +9,7 @@ from spacy.tokens cimport Tokens


 cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word, size_t length)
+    cdef int find_split(self, unicode word)


 cdef English EN

diff --git a/spacy/en.pyx b/spacy/en.pyx
index 3245d8fa9..f90af1549 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -17,10 +17,13 @@ cimport spacy


 cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word, size_t length):
+    cdef int find_split(self, unicode word):
+        cdef size_t length = len(word)
         cdef int i = 0
+        if word.startswith("'s") or word.startswith("'S"):
+            return 2
         # Contractions
-        if word.endswith("'s"):
+        if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
         if is_punct(word, 0, length):
@@ -36,7 +39,6 @@ cdef class English(spacy.Language):
 cdef bint is_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
-        # ...Unless we're at 0
         return i == 0
     if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
         return False
@@ -57,7 +59,7 @@ cpdef Tokens tokenize(unicode string):


 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return EN.lookup_chunk(string)
+    return EN.lookup(string)


 cpdef unicode unhash(StringHash hash_value):

diff --git a/spacy/en_ptb.pxd b/spacy/en_ptb.pxd
index eaa0f8471..2f139a94f 100644
--- a/spacy/en_ptb.pxd
+++ b/spacy/en_ptb.pxd
@@ -8,7 +8,7 @@ from spacy.tokens cimport Tokens


 cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word, size_t length)
+    cdef int find_split(self, unicode word)


 cdef EnglishPTB EN_PTB

diff --git a/spacy/en_ptb.pyx b/spacy/en_ptb.pyx
index 078b91b40..f70b26d45 100644
--- a/spacy/en_ptb.pyx
+++ b/spacy/en_ptb.pyx
@@ -17,7 +17,8 @@ cimport spacy


 cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word, size_t length):
+    cdef int find_split(self, unicode word):
+        length = len(word)
         cdef int i = 0
         # Contractions
         if word.endswith("'s"):
@@ -53,7 +54,7 @@ cpdef Tokens tokenize(unicode string):


 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return EN_PTB.lookup_chunk(string)
+    return EN_PTB.lookup_chunk(string)


 cpdef unicode unhash(StringHash hash_value):

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 145a043c8..90d06587e 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -32,14 +32,13 @@ cdef struct Lexeme:
     Distribution* dist  # Distribution info, lazy loaded
     Orthography* orth   # Extra orthographic views
-    Lexeme* tail        # Lexemes are linked lists, to deal with sub-tokens
+    #Lexeme* tail        # Lexemes are linked lists, to deal with sub-tokens


 cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)


 cdef enum StringAttr:
-    SIC
     LEX
     NORM
     SHAPE
@@ -49,7 +48,6 @@ cdef enum StringAttr:


 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
-cpdef StringHash sic_of(size_t lex_id) except 0
 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 430033db0..42c93ec60 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -22,9 +22,7 @@ from spacy.spacy cimport StringHash


 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
-    if attr == SIC:
-        return sic_of(lex_id)
-    elif attr == LEX:
+    if attr == LEX:
         return lex_of(lex_id)
     elif attr == NORM:
         return norm_of(lex_id)
@@ -38,18 +36,6 @@ cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
         raise StandardError


-cpdef StringHash sic_of(size_t lex_id) except 0:
-    '''Access the `sic' field of the Lexeme pointed to by lex_id.
-
-    The sic field stores the hash of the whitespace-delimited string-chunk used to
-    construct the Lexeme.
-
-    >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]
-    [u'Hi!', u'', u'world]
-    '''
-    return (<Lexeme*>lex_id).sic
-
-
 cpdef StringHash lex_of(size_t lex_id) except 0:
     '''Access the `lex' field of the Lexeme pointed to by lex_id.

diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index 3afc9a467..813eaa438 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -3,8 +3,6 @@ from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t

 from sparsehash.dense_hash_map cimport dense_hash_map
-from _hashing cimport FixedTable
-from _hashing cimport WordTree

 # Circular import problems here
 ctypedef size_t Lexeme_addr
@@ -28,22 +26,21 @@ from spacy._hashing cimport WordTree

 cdef class Language:
     cdef object name
-    cdef WordTree vocab
-    cdef WordTree distri
-    cdef WordTree ortho
+    cdef dense_hash_map[StringHash, size_t] chunks
+    cdef dense_hash_map[StringHash, size_t] vocab
     cdef dict bacov

-    cpdef Tokens tokenize(self, unicode text)
+    cdef Tokens tokenize(self, unicode text)

-    cdef Lexeme_addr lookup(self, unicode string) except 0
-    cdef Lexeme_addr lookup_chunk(self, unicode string) except 0
-    cdef Orthography* lookup_orth(self, unicode lex) except NULL
-    cdef Distribution* lookup_dist(self, unicode lex) except NULL
+    cdef Lexeme* lookup(self, unicode string) except NULL
+    cdef Lexeme** lookup_chunk(self, unicode string) except NULL

-    cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL
+    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
+    cdef Lexeme* new_lexeme(self, unicode lex) except NULL
     cdef Orthography* new_orth(self, unicode lex) except NULL
     cdef Distribution* new_dist(self, unicode lex) except NULL

     cdef unicode unhash(self, StringHash hashed)

-    cdef int find_split(self, unicode word, size_t length)
+    cpdef list find_substrings(self, unicode word)
+    cdef int find_split(self, unicode word)

diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index d49138801..7da7c475f 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -5,7 +5,6 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref

-from murmurhash cimport mrmr

 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
@@ -64,86 +63,56 @@ cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
-        self.vocab = WordTree(0, 5)
-        self.ortho = WordTree(0, 5)
-        self.distri = WordTree(0, 5)
+        self.chunks = dense_hash_map[StringHash, size_t]()
+        self.vocab = dense_hash_map[StringHash, size_t]()
+        self.chunks.set_empty_key(0)
+        self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))

-    cpdef Tokens tokenize(self, unicode characters):
+    cdef Tokens tokenize(self, unicode characters):
         cdef size_t i = 0
         cdef size_t start = 0
-
+        cdef Lexeme** chunk
         cdef Tokens tokens = Tokens(self)
-        cdef Lexeme* token
-        for c in characters:
-            if _is_whitespace(c):
-                if start < i:
-                    token = self.lookup_chunk(characters[start:i])
-                    while token != NULL:
-                        tokens.append(token)
-                        token = token.tail
-                start = i + 1
-            i += 1
-        if start < i:
-            token = self.lookup_chunk(characters[start:])
-            while token != NULL:
-                tokens.append(token)
-                token = token.tail
+        for chunk_str in characters.split():
+            chunk = self.lookup_chunk(chunk_str)
+            i = 0
+            while chunk[i] != NULL:
+                tokens.append(chunk[i])
+                i += 1
         return tokens

-    cdef Lexeme_addr lookup(self, unicode string) except 0:
-        cdef size_t length = len(string)
-        if length == 0:
-            return &BLANK_WORD
+    cdef Lexeme* lookup(self, unicode string) except NULL:
+        if len(string) == 0:
+            return &BLANK_WORD
+        cdef Lexeme* word = self.vocab[hash(string)]
+        if word == NULL:
+            word = self.new_lexeme(string)
+        return word

-        cdef StringHash hashed = hash(string)
-        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = self.vocab.get(string)
-        if word_ptr == NULL:
-            word_ptr = self.new_lexeme(string, string)
-        return word_ptr
-
-    cdef Lexeme_addr lookup_chunk(self, unicode string) except 0:
-        '''Fetch a Lexeme representing a word string. If the word has not been seen,
-        construct one, splitting off any attached punctuation or clitics. A
-        reference to BLANK_WORD is returned for the empty string.
-        '''
-        cdef size_t length = len(string)
-        if length == 0:
-            return &BLANK_WORD
-        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = self.vocab.get(string)
+    cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
+        assert len(string) != 0
+        cdef Lexeme** chunk = self.chunks[hash(string)]
         cdef int split
-        if word_ptr == NULL:
-            split = self.find_split(string, length)
-            if split != 0 and split != -1 and split < length:
-                word_ptr = self.new_lexeme(string, string[:split])
-                word_ptr.tail = self.lookup_chunk(string[split:])
-            else:
-                word_ptr = self.new_lexeme(string, string)
-        return word_ptr
+        if chunk == NULL:
+            chunk = self.new_chunk(string, self.find_substrings(string))
+        return chunk

-    cdef Orthography* lookup_orth(self, unicode lex):
-        cdef Orthography* orth = self.ortho.get(lex)
-        if orth == NULL:
-            orth = self.new_orth(lex)
-        return orth
+    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
+        cdef Lexeme** chunk = calloc(len(substrings) + 1, sizeof(Lexeme*))
+        for i, substring in enumerate(substrings):
+            chunk[i] = self.lookup(substring)
+            chunk[i + 1] = NULL
+        self.chunks[hash(string)] = chunk
+        return chunk

-    cdef Distribution* lookup_dist(self, unicode lex):
-        cdef Distribution* dist = self.distri.get(lex)
-        if dist == NULL:
-            dist = self.new_dist(lex)
-        return dist
-
-    cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL:
+    cdef Lexeme* new_lexeme(self, unicode string) except NULL:
         cdef Lexeme* word = calloc(1, sizeof(Lexeme))
-        word.sic = hash(key)
         word.lex = hash(string)
         self.bacov[word.lex] = string
-        self.bacov[word.sic] = key
-        word.orth = self.lookup_orth(string)
-        word.dist = self.lookup_dist(string)
-        self.vocab.set(key, word)
+        word.orth = self.new_orth(string)
+        word.dist = self.new_dist(string)
+        self.vocab[word.lex] = word
         return word

     cdef Orthography* new_orth(self, unicode lex) except NULL:
@@ -170,30 +139,33 @@ cdef class Language:
         self.bacov[orth.norm] = norm
         self.bacov[orth.shape] = shape

-        self.ortho.set(lex, orth)
         return orth

     cdef Distribution* new_dist(self, unicode lex) except NULL:
         dist = calloc(1, sizeof(Distribution))
-        self.distri.set(lex, dist)
         return dist

     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]

-    cdef int find_split(self, unicode word, size_t length):
-        return -1
+    cpdef list find_substrings(self, unicode word):
+        substrings = []
+        while word:
+            split = self.find_split(word)
+            if split == 0:
+                substrings.append(word)
+                break
+            substrings.append(word[:split])
+            word = word[split:]
+        return substrings
+
+    cdef int find_split(self, unicode word):
+        return len(word)

     def load_tokenization(self, token_rules=None):
-        cdef Lexeme* word
-        cdef StringHash hashed
-        for chunk, lex, tokens in token_rules:
-            word = self.new_lexeme(chunk, lex)
-            for i, lex in enumerate(tokens):
-                token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-                word.tail = self.new_lexeme(token_string, lex)
-                word = word.tail
+        for chunk, tokens in token_rules:
+            self.new_chunk(chunk, tokens)

     def load_clusters(self):
         cdef Lexeme* w
@@ -209,24 +181,4 @@ cdef class Language:
             # the first 4 bits. See redshift._parse_features.pyx
             cluster = int(cluster_str[::-1], 2)
             upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-            word = self.new_lexeme(token_string, token_string)
-
-
-cdef inline bint _is_whitespace(unsigned char c) nogil:
-    if c == b' ':
-        return True
-    elif c == b'\n':
-        return True
-    elif c == b'\t':
-        return True
-    else:
-        return False
-
-
-cpdef vector[size_t] expand_chunk(size_t addr) except *:
-    cdef vector[size_t] tokens = vector[size_t]()
-    word = <Lexeme*>addr
-    while word != NULL:
-        tokens.push_back(word)
-        word = word.tail
-    return tokens
+            self.new_lexeme(token_string)

diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 5359761c0..ba692280f 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -1,5 +1,6 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport Lexeme

 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language

diff --git a/spacy/util.py b/spacy/util.py
index 4e080d0b3..4d12014ca 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -32,13 +32,12 @@ def read_tokenization(lang):
                 continue
             pieces = line.split()
             chunk = pieces.pop(0)
-            lex = pieces.pop(0)
             assert chunk not in seen, chunk
             seen.add(chunk)
-            entries.append((chunk, lex, pieces))
+            entries.append((chunk, list(pieces)))
             if chunk[0].isalpha() and chunk[0].islower():
                 chunk = chunk[0].title() + chunk[1:]
-                lex = lex[0].title() + lex[1:]
+                pieces[0] = pieces[0][0].title() + pieces[0][1:]
                 seen.add(chunk)
-                entries.append((chunk, lex, pieces))
+                entries.append((chunk, pieces))
     return entries

diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index aa11faa39..1839b15f5 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -1,44 +1,43 @@
 from __future__ import unicode_literals

-from spacy.spacy import expand_chunk
-from spacy.en import lookup, unhash
+from spacy.en import tokenize, lookup, unhash

 from spacy import lex_of


 def test_possess():
-    tokens = expand_chunk(lookup("Mike's"))
-    assert len(tokens) == 2
+    tokens = tokenize("Mike's")
     assert unhash(lex_of(tokens[0])) == "Mike"
     assert unhash(lex_of(tokens[1])) == "'s"
+    assert len(tokens) == 2


 def test_apostrophe():
-    tokens = expand_chunk(lookup("schools'"))
+    tokens = tokenize("schools'")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[1])) == "'"
     assert unhash(lex_of(tokens[0])) == "schools"


 def test_LL():
-    tokens = expand_chunk(lookup("we'll"))
+    tokens = tokenize("we'll")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[1])) == "will"
     assert unhash(lex_of(tokens[0])) == "we"


 def test_aint():
-    tokens = expand_chunk(lookup("ain't"))
+    tokens = tokenize("ain't")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "are"
     assert unhash(lex_of(tokens[1])) == "not"


 def test_capitalized():
-    tokens = expand_chunk(lookup("can't"))
+    tokens = tokenize("can't")
     assert len(tokens) == 2
-    tokens = expand_chunk(lookup("Can't"))
+    tokens = tokenize("Can't")
     assert len(tokens) == 2
-    tokens = expand_chunk(lookup("Ain't"))
+    tokens = tokenize("Ain't")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "Are"

diff --git a/tests/test_group_by.py b/tests/test_group_by.py
index 2f9dd6ce0..9f83c5ce9 100644
--- a/tests/test_group_by.py
+++ b/tests/test_group_by.py
@@ -5,7 +5,7 @@ import pytest
 from spacy import en
 from spacy.lexeme import lex_of

-from spacy import SIC, LEX, NORM, SHAPE, LAST3
+from spacy import LEX, NORM, SHAPE, LAST3


 def test_group_by_lex():

diff --git a/tests/test_orth.py b/tests/test_orth.py
index f13fa90bf..503394916 100644
--- a/tests/test_orth.py
+++ b/tests/test_orth.py
@@ -4,7 +4,7 @@ import pytest

 from spacy.en import lookup, unhash

-from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of, length_of
+from spacy.lexeme import lex_of, norm_of, shape_of, first_of, length_of
 from spacy.lexeme import shape_of

 @pytest.fixture

diff --git a/tests/test_post_punct.py b/tests/test_post_punct.py
index 0138819db..f8391235a 100644
--- a/tests/test_post_punct.py
+++ b/tests/test_post_punct.py
@@ -1,8 +1,8 @@
 from __future__ import unicode_literals

 from spacy import lex_of
-from spacy.spacy import expand_chunk
 from spacy.en import lookup
+from spacy.en import tokenize
 from spacy.en import unhash

 import pytest
@@ -17,8 +17,7 @@ def test_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        token = lookup(string)
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 2
         assert unhash(lex_of(tokens[1])) == p
         assert unhash(lex_of(tokens[0])) == word_str
@@ -28,9 +27,7 @@ def test_two_different_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        token = lookup(string)
-        assert unhash(lex_of(token)) == word_str
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == word_str
         assert unhash(lex_of(tokens[1])) == p
@@ -41,7 +38,7 @@ def test_three_same_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 4
         assert unhash(lex_of(tokens[0])) == word_str
         assert unhash(lex_of(tokens[1])) == p

diff --git a/tests/test_pre_punct.py b/tests/test_pre_punct.py
index d1cd10bf6..5a4a4d072 100644
--- a/tests/test_pre_punct.py
+++ b/tests/test_pre_punct.py
@@ -1,8 +1,8 @@
 from __future__ import unicode_literals

 from spacy import lex_of
-from spacy.spacy import expand_chunk
 from spacy.en import lookup
+from spacy.en import tokenize
 from spacy.en import unhash

 import pytest
@@ -17,9 +17,7 @@ def test_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 2
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[1])) == word_str
@@ -29,9 +27,7 @@ def test_two_different_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[1])) == "`"
@@ -42,9 +38,7 @@ def test_three_same_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 4
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[3])) == word_str
@@ -52,6 +46,6 @@ def test_three_same_open(open_puncts):

 def test_open_appostrophe():
     string = "'The"
-    tokens = expand_chunk(lookup(string))
+    tokens = tokenize(string)
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "'"

diff --git a/tests/test_rules.py b/tests/test_rules.py
index f95f1f820..b19a1c3f1 100644
--- a/tests/test_rules.py
+++ b/tests/test_rules.py
@@ -5,7 +5,7 @@ def test_load_en():
     rules = util.read_tokenization('en')
     assert len(rules) != 0
     aint = [rule for rule in rules if rule[0] == "ain't"][0]
-    chunk, lex, pieces = aint
+    chunk, pieces = aint
     assert chunk == "ain't"
-    assert lex == "are"
-    assert pieces == ["not"]
+    assert pieces[0] == "are"
+    assert pieces[1] == "not"

diff --git a/tests/test_surround_punct.py b/tests/test_surround_punct.py
index bef9cc83a..2c3a7f837 100644
--- a/tests/test_surround_punct.py
+++ b/tests/test_surround_punct.py
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals

-from spacy import lex_of, sic_of
-from spacy.spacy import expand_chunk
+from spacy import lex_of
+from spacy.en import tokenize
 from spacy.en import lookup
 from spacy.en import unhash

@@ -17,19 +17,18 @@ def test_token(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == open_
         assert unhash(lex_of(tokens[1])) == word_str
         assert unhash(lex_of(tokens[2])) == close_
-        assert unhash(sic_of(tokens[0])) == string


 def test_two_different(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 5
         assert unhash(lex_of(tokens[0])) == "`"
         assert unhash(lex_of(tokens[1])) == open_

diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index c3760c6fb..a0dbdc129 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -19,15 +19,12 @@ def test_two_words():


 def test_punct():
-    lex_ids = tokenize('hello, possums.')
-    assert len(lex_ids) == 4
-    assert lex_ids[0] != lookup('hello')
-    assert lex_of(lex_ids[0]) == lex_of(lookup('hello'))
-    assert lex_ids[2] == lookup('possums.')
-    assert lex_of(lex_ids[2]) == lex_of(lookup('possums.'))
-    assert lex_of(lex_ids[2]) == lex_of(lookup('possums'))
-    assert lex_of(lex_ids[1]) != lex_of(lookup('hello'))
-    assert lex_ids[0] != lookup('hello.')
+    tokens = tokenize('hello, possums.')
+    assert len(tokens) == 4
+    assert lex_of(tokens[0]) == lex_of(lookup('hello'))
+    assert lex_of(tokens[1]) == lex_of(lookup(','))
+    assert lex_of(tokens[2]) == lex_of(lookup('possums'))
+    assert lex_of(tokens[1]) != lex_of(lookup('hello'))


 def test_digits():
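Usage sketch (illustrative note, not part of the patch). After this refactor a
whitespace-delimited chunk is cached as a NULL-terminated array of Lexeme
pointers, and tokenize() appends one lexeme per word. The snippet below mirrors
the behaviour exercised by tests/test_contractions.py, and restates the
splitting loop of Language.find_substrings in plain Python; the trailing_dot
helper is a hypothetical stand-in for the English find_split rules in en.pyx,
used only to make the loop concrete.

    from spacy.en import tokenize, unhash
    from spacy import lex_of

    # One lexeme per word: the chunk "Mike's" expands to ["Mike", "'s"],
    # as asserted in tests/test_contractions.py::test_possess.
    tokens = tokenize("Mike's")
    assert [unhash(lex_of(tokens[i])) for i in range(len(tokens))] == ["Mike", "'s"]

    # Plain-Python mirror of Language.find_substrings: keep splitting the
    # chunk until find_split() reports nothing left to split off.
    def find_substrings(word, find_split):
        substrings = []
        while word:
            split = find_split(word)
            if split == 0:
                substrings.append(word)
                break
            substrings.append(word[:split])
            word = word[split:]
        return substrings

    # Hypothetical find_split that peels one trailing period off a chunk.
    def trailing_dot(w):
        return len(w) - 1 if len(w) > 1 and w.endswith('.') else 0

    assert find_substrings('possums.', trailing_dot) == ['possums', '.']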