* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word.

Matthew Honnibal 2014-08-18 19:14:00 +02:00
parent b94c9b72c9
commit 01469b0888
20 changed files with 123 additions and 212 deletions
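The gist of the change, for readers skimming the diff below: a whitespace-delimited chunk no longer expands into a linked list hanging off a single Lexeme (the old `tail` pointer); instead each chunk is cached as a NULL-terminated array of Lexeme pointers, one per word-like substring, and `tokenize()` simply flattens those arrays. What follows is a minimal pure-Python sketch of that scheme, not code from the commit: `ToyLanguage`, its dict caches, and the trailing-"'s" split rule are illustrative stand-ins for the Cython `Language` class, its `dense_hash_map` tables, and `English.find_split`; only the method names `find_substrings`, `lookup_chunk`, and `tokenize` mirror the new code in `spacy/spacy.pyx`.

# Minimal pure-Python sketch of the new chunk scheme (illustrative only).
class ToyLanguage(object):
    def __init__(self):
        self.chunks = {}  # chunk string -> list of lexemes (the per-chunk "array")
        self.vocab = {}   # word string -> interned lexeme

    def find_split(self, word):
        # Stand-in splitter: peel a trailing "'s" off the chunk, else keep it whole.
        if word.endswith("'s") and len(word) >= 3:
            return len(word) - 2
        return len(word)

    def find_substrings(self, word):
        # Mirrors Language.find_substrings in spacy/spacy.pyx: keep splitting
        # until find_split() consumes the remainder in one piece.
        substrings = []
        while word:
            split = self.find_split(word)
            if split == 0:
                substrings.append(word)
                break
            substrings.append(word[:split])
            word = word[split:]
        return substrings

    def lookup_chunk(self, chunk_str):
        # One cache entry per whitespace-delimited chunk; the entry is an
        # array of lexemes, one per word, rather than a Lexeme with a tail.
        if chunk_str not in self.chunks:
            self.chunks[chunk_str] = [self.lookup(s) for s in self.find_substrings(chunk_str)]
        return self.chunks[chunk_str]

    def lookup(self, string):
        # Intern each word string once, like new_lexeme() filling self.vocab.
        return self.vocab.setdefault(string, string)

    def tokenize(self, text):
        # One lexeme per word: split on whitespace, then flatten each chunk's array.
        tokens = []
        for chunk_str in text.split():
            tokens.extend(self.lookup_chunk(chunk_str))
        return tokens

print(ToyLanguage().tokenize("Mike's dog"))  # -> ['Mike', "'s", 'dog']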

View File

@@ -39,29 +39,20 @@ cython_includes = ['.']
 if 'VIRTUAL_ENV' in os.environ:
     includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'include', 'site', '*'))
-    cython_includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'lib', '*'))
 else:
     # If you're not using virtualenv, set your include dir here.
     pass

 exts = [
+    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++",
-              include_dirs=includes, cython_include_dirs=cython_includes),
-    Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.chartree", ["spacy/chartree.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
+              include_dirs=includes),
+    Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
-              include_dirs=includes, cython_include_dirs=cython_includes),
+              include_dirs=includes),
 ]

View File

@@ -1,16 +1,14 @@
 from .lexeme import lex_of
-from .lexeme import sic_of
 from .lexeme import length_of
 from .tokens import Tokens

 # Don't know how to get the enum Python visible :(
-SIC = 0
-LEX = 1
-NORM = 2
-SHAPE = 3
-LAST3 = 4
-LENGTH = 5
+LEX = 0
+NORM = 1
+SHAPE = 2
+LAST3 = 3
+LENGTH = 4

-__all__ = [Tokens, lex_of, sic_of, length_of, SIC, LEX, NORM, SHAPE, LAST3, LENGTH]
+__all__ = [Tokens, lex_of, length_of, LEX, NORM, SHAPE, LAST3, LENGTH]

View File

@@ -9,7 +9,7 @@ from spacy.tokens cimport Tokens
 cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word, size_t length)
+    cdef int find_split(self, unicode word)

 cdef English EN

View File

@@ -17,10 +17,13 @@ cimport spacy
 cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word, size_t length):
+    cdef int find_split(self, unicode word):
+        cdef size_t length = len(word)
         cdef int i = 0
+        if word.startswith("'s") or word.startswith("'S"):
+            return 2
         # Contractions
-        if word.endswith("'s"):
+        if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
         if is_punct(word, 0, length):
@@ -36,7 +39,6 @@ cdef class English(spacy.Language):
 cdef bint is_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
-        # ...Unless we're at 0
         return i == 0
     if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
         return False
@@ -57,7 +59,7 @@ cpdef Tokens tokenize(unicode string):
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return EN.lookup_chunk(string)
+    return <Lexeme_addr>EN.lookup(string)

 cpdef unicode unhash(StringHash hash_value):

View File

@@ -8,7 +8,7 @@ from spacy.tokens cimport Tokens
 cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word, size_t length)
+    cdef int find_split(self, unicode word)

 cdef EnglishPTB EN_PTB

View File

@@ -17,7 +17,8 @@ cimport spacy
 cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word, size_t length):
+    cdef int find_split(self, unicode word):
+        length = len(word)
         cdef int i = 0
         # Contractions
         if word.endswith("'s"):
@@ -53,7 +54,7 @@ cpdef Tokens tokenize(unicode string):
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return EN_PTB.lookup_chunk(string)
+    return <Lexeme_addr>EN_PTB.lookup_chunk(string)

 cpdef unicode unhash(StringHash hash_value):

View File

@@ -32,14 +32,13 @@ cdef struct Lexeme:
     Distribution* dist  # Distribution info, lazy loaded
     Orthography* orth  # Extra orthographic views
-    Lexeme* tail  # Lexemes are linked lists, to deal with sub-tokens
+    #Lexeme* tail  # Lexemes are linked lists, to deal with sub-tokens

 cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)

 cdef enum StringAttr:
-    SIC
     LEX
     NORM
     SHAPE
@@ -49,7 +48,6 @@ cdef enum StringAttr:
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

-cpdef StringHash sic_of(size_t lex_id) except 0
 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0

View File

@@ -22,9 +22,7 @@ from spacy.spacy cimport StringHash
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
-    if attr == SIC:
-        return sic_of(lex_id)
-    elif attr == LEX:
+    if attr == LEX:
         return lex_of(lex_id)
     elif attr == NORM:
         return norm_of(lex_id)
@@ -38,18 +36,6 @@ cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
         raise StandardError

-cpdef StringHash sic_of(size_t lex_id) except 0:
-    '''Access the `sic' field of the Lexeme pointed to by lex_id.
-
-    The sic field stores the hash of the whitespace-delimited string-chunk used to
-    construct the Lexeme.
-
-    >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]
-    [u'Hi!', u'', u'world]
-    '''
-    return (<Lexeme*>lex_id).sic
-
 cpdef StringHash lex_of(size_t lex_id) except 0:
     '''Access the `lex' field of the Lexeme pointed to by lex_id.

View File

@@ -3,8 +3,6 @@ from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 from sparsehash.dense_hash_map cimport dense_hash_map
-from _hashing cimport FixedTable
-from _hashing cimport WordTree

 # Circular import problems here
 ctypedef size_t Lexeme_addr
@@ -28,22 +26,21 @@ from spacy._hashing cimport WordTree
 cdef class Language:
     cdef object name
-    cdef WordTree vocab
-    cdef WordTree distri
-    cdef WordTree ortho
+    cdef dense_hash_map[StringHash, size_t] chunks
+    cdef dense_hash_map[StringHash, size_t] vocab
     cdef dict bacov

-    cpdef Tokens tokenize(self, unicode text)
+    cdef Tokens tokenize(self, unicode text)

-    cdef Lexeme_addr lookup(self, unicode string) except 0
-    cdef Lexeme_addr lookup_chunk(self, unicode string) except 0
-    cdef Orthography* lookup_orth(self, unicode lex) except NULL
-    cdef Distribution* lookup_dist(self, unicode lex) except NULL
+    cdef Lexeme* lookup(self, unicode string) except NULL
+    cdef Lexeme** lookup_chunk(self, unicode string) except NULL

-    cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL
+    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
+    cdef Lexeme* new_lexeme(self, unicode lex) except NULL
     cdef Orthography* new_orth(self, unicode lex) except NULL
     cdef Distribution* new_dist(self, unicode lex) except NULL

     cdef unicode unhash(self, StringHash hashed)

-    cdef int find_split(self, unicode word, size_t length)
+    cpdef list find_substrings(self, unicode word)
+    cdef int find_split(self, unicode word)

View File

@@ -5,7 +5,6 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref
-from murmurhash cimport mrmr

 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
@@ -64,86 +63,56 @@ cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
-        self.vocab = WordTree(0, 5)
-        self.ortho = WordTree(0, 5)
-        self.distri = WordTree(0, 5)
+        self.chunks = dense_hash_map[StringHash, size_t]()
+        self.vocab = dense_hash_map[StringHash, size_t]()
+        self.chunks.set_empty_key(0)
+        self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))

-    cpdef Tokens tokenize(self, unicode characters):
+    cdef Tokens tokenize(self, unicode characters):
         cdef size_t i = 0
         cdef size_t start = 0
+        cdef Lexeme** chunk
         cdef Tokens tokens = Tokens(self)
-        cdef Lexeme* token
-        for c in characters:
-            if _is_whitespace(c):
-                if start < i:
-                    token = <Lexeme*>self.lookup_chunk(characters[start:i])
-                    while token != NULL:
-                        tokens.append(<Lexeme_addr>token)
-                        token = token.tail
-                start = i + 1
-            i += 1
-        if start < i:
-            token = <Lexeme*>self.lookup_chunk(characters[start:])
-            while token != NULL:
-                tokens.append(<Lexeme_addr>token)
-                token = token.tail
+        for chunk_str in characters.split():
+            chunk = self.lookup_chunk(chunk_str)
+            i = 0
+            while chunk[i] != NULL:
+                tokens.append(<Lexeme_addr>chunk[i])
+                i += 1
         return tokens

-    cdef Lexeme_addr lookup(self, unicode string) except 0:
-        cdef size_t length = len(string)
-        if length == 0:
-            return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = hash(string)
-        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
-        if word_ptr == NULL:
-            word_ptr = self.new_lexeme(string, string)
-        return <Lexeme_addr>word_ptr
-
-    cdef Lexeme_addr lookup_chunk(self, unicode string) except 0:
-        '''Fetch a Lexeme representing a word string. If the word has not been seen,
-        construct one, splitting off any attached punctuation or clitics. A
-        reference to BLANK_WORD is returned for the empty string.
-        '''
-        cdef size_t length = len(string)
-        if length == 0:
-            return <Lexeme_addr>&BLANK_WORD
-        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
+    cdef Lexeme* lookup(self, unicode string) except NULL:
+        if len(string) == 0:
+            return &BLANK_WORD
+        cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
+        if word == NULL:
+            word = self.new_lexeme(string)
+        return word
+
+    cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
+        assert len(string) != 0
+        cdef Lexeme** chunk = <Lexeme**>self.chunks[hash(string)]
         cdef int split
-        if word_ptr == NULL:
-            split = self.find_split(string, length)
-            if split != 0 and split != -1 and split < length:
-                word_ptr = self.new_lexeme(string, string[:split])
-                word_ptr.tail = <Lexeme*>self.lookup_chunk(string[split:])
-            else:
-                word_ptr = self.new_lexeme(string, string)
-        return <Lexeme_addr>word_ptr
+        if chunk == NULL:
+            chunk = self.new_chunk(string, self.find_substrings(string))
+        return chunk

-    cdef Orthography* lookup_orth(self, unicode lex):
-        cdef Orthography* orth = <Orthography*>self.ortho.get(lex)
-        if orth == NULL:
-            orth = self.new_orth(lex)
-        return orth
+    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
+        cdef Lexeme** chunk = <Lexeme**>calloc(len(substrings) + 1, sizeof(Lexeme*))
+        for i, substring in enumerate(substrings):
+            chunk[i] = self.lookup(substring)
+        chunk[i + 1] = NULL
+        self.chunks[hash(string)] = <size_t>chunk
+        return chunk

-    cdef Distribution* lookup_dist(self, unicode lex):
-        cdef Distribution* dist = <Distribution*>self.distri.get(lex)
-        if dist == NULL:
-            dist = self.new_dist(lex)
-        return dist
-
-    cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL:
+    cdef Lexeme* new_lexeme(self, unicode string) except NULL:
         cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-        word.sic = hash(key)
         word.lex = hash(string)
         self.bacov[word.lex] = string
-        self.bacov[word.sic] = key
-        word.orth = self.lookup_orth(string)
-        word.dist = self.lookup_dist(string)
-        self.vocab.set(key, <size_t>word)
+        word.orth = self.new_orth(string)
+        word.dist = self.new_dist(string)
+        self.vocab[word.lex] = <size_t>word
         return word

     cdef Orthography* new_orth(self, unicode lex) except NULL:
@@ -170,30 +139,33 @@ cdef class Language:
         self.bacov[orth.norm] = norm
         self.bacov[orth.shape] = shape
-        self.ortho.set(lex, <size_t>orth)
         return orth

     cdef Distribution* new_dist(self, unicode lex) except NULL:
         dist = <Distribution*>calloc(1, sizeof(Distribution))
-        self.distri.set(lex, <size_t>dist)
         return dist

     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]

-    cdef int find_split(self, unicode word, size_t length):
-        return -1
+    cpdef list find_substrings(self, unicode word):
+        substrings = []
+        while word:
+            split = self.find_split(word)
+            if split == 0:
+                substrings.append(word)
+                break
+            substrings.append(word[:split])
+            word = word[split:]
+        return substrings
+
+    cdef int find_split(self, unicode word):
+        return len(word)

     def load_tokenization(self, token_rules=None):
-        cdef Lexeme* word
-        cdef StringHash hashed
-        for chunk, lex, tokens in token_rules:
-            word = <Lexeme*>self.new_lexeme(chunk, lex)
-            for i, lex in enumerate(tokens):
-                token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-                word.tail = <Lexeme*>self.new_lexeme(token_string, lex)
-                word = word.tail
+        for chunk, tokens in token_rules:
+            self.new_chunk(chunk, tokens)

     def load_clusters(self):
         cdef Lexeme* w
@@ -209,24 +181,4 @@ cdef class Language:
             # the first 4 bits. See redshift._parse_features.pyx
            cluster = int(cluster_str[::-1], 2)
            upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-            word = self.new_lexeme(token_string, token_string)
-
-
-cdef inline bint _is_whitespace(unsigned char c) nogil:
-    if c == b' ':
-        return True
-    elif c == b'\n':
-        return True
-    elif c == b'\t':
-        return True
-    else:
-        return False
-
-
-cpdef vector[size_t] expand_chunk(size_t addr) except *:
-    cdef vector[size_t] tokens = vector[size_t]()
-    word = <Lexeme*>addr
-    while word != NULL:
-        tokens.push_back(<size_t>word)
-        word = word.tail
-    return tokens
+            self.new_lexeme(token_string)

View File

@@ -1,5 +1,6 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport Lexeme
 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language

View File

@@ -32,13 +32,12 @@ def read_tokenization(lang):
                 continue
             pieces = line.split()
             chunk = pieces.pop(0)
-            lex = pieces.pop(0)
             assert chunk not in seen, chunk
             seen.add(chunk)
-            entries.append((chunk, lex, pieces))
+            entries.append((chunk, list(pieces)))
             if chunk[0].isalpha() and chunk[0].islower():
                 chunk = chunk[0].title() + chunk[1:]
-                lex = lex[0].title() + lex[1:]
+                pieces[0] = pieces[0][0].title() + pieces[0][1:]
                 seen.add(chunk)
-                entries.append((chunk, lex, pieces))
+                entries.append((chunk, pieces))
     return entries

View File

@@ -1,44 +1,43 @@
 from __future__ import unicode_literals

-from spacy.spacy import expand_chunk
-from spacy.en import lookup, unhash
+from spacy.en import tokenize, lookup, unhash
 from spacy import lex_of


 def test_possess():
-    tokens = expand_chunk(lookup("Mike's"))
-    assert len(tokens) == 2
+    tokens = tokenize("Mike's")
     assert unhash(lex_of(tokens[0])) == "Mike"
     assert unhash(lex_of(tokens[1])) == "'s"
+    assert len(tokens) == 2


 def test_apostrophe():
-    tokens = expand_chunk(lookup("schools'"))
+    tokens = tokenize("schools'")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[1])) == "'"
     assert unhash(lex_of(tokens[0])) == "schools"


 def test_LL():
-    tokens = expand_chunk(lookup("we'll"))
+    tokens = tokenize("we'll")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[1])) == "will"
     assert unhash(lex_of(tokens[0])) == "we"


 def test_aint():
-    tokens = expand_chunk(lookup("ain't"))
+    tokens = tokenize("ain't")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "are"
     assert unhash(lex_of(tokens[1])) == "not"


 def test_capitalized():
-    tokens = expand_chunk(lookup("can't"))
+    tokens = tokenize("can't")
     assert len(tokens) == 2
-    tokens = expand_chunk(lookup("Can't"))
+    tokens = tokenize("Can't")
     assert len(tokens) == 2
-    tokens = expand_chunk(lookup("Ain't"))
+    tokens = tokenize("Ain't")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "Are"

View File

@@ -5,7 +5,7 @@ import pytest
 from spacy import en
 from spacy.lexeme import lex_of
-from spacy import SIC, LEX, NORM, SHAPE, LAST3
+from spacy import LEX, NORM, SHAPE, LAST3


 def test_group_by_lex():

View File

@@ -4,7 +4,7 @@ import pytest
 from spacy.en import lookup, unhash
-from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of, length_of
+from spacy.lexeme import lex_of, norm_of, shape_of, first_of, length_of
 from spacy.lexeme import shape_of


 @pytest.fixture

View File

@@ -1,8 +1,8 @@
 from __future__ import unicode_literals

 from spacy import lex_of
-from spacy.spacy import expand_chunk
 from spacy.en import lookup
+from spacy.en import tokenize
 from spacy.en import unhash

 import pytest
@@ -17,8 +17,7 @@ def test_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        token = lookup(string)
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 2
         assert unhash(lex_of(tokens[1])) == p
         assert unhash(lex_of(tokens[0])) == word_str
@@ -28,9 +27,7 @@ def test_two_different_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        token = lookup(string)
-        assert unhash(lex_of(token)) == word_str
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == word_str
         assert unhash(lex_of(tokens[1])) == p
@@ -41,7 +38,7 @@ def test_three_same_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 4
         assert unhash(lex_of(tokens[0])) == word_str
         assert unhash(lex_of(tokens[1])) == p

View File

@@ -1,8 +1,8 @@
 from __future__ import unicode_literals

 from spacy import lex_of
-from spacy.spacy import expand_chunk
 from spacy.en import lookup
+from spacy.en import tokenize
 from spacy.en import unhash

 import pytest
@@ -17,9 +17,7 @@ def test_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 2
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[1])) == word_str
@@ -29,9 +27,7 @@ def test_two_different_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[1])) == "`"
@@ -42,9 +38,7 @@ def test_three_same_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 4
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[3])) == word_str
@@ -52,6 +46,6 @@
 def test_open_appostrophe():
     string = "'The"
-    tokens = expand_chunk(lookup(string))
+    tokens = tokenize(string)
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "'"

View File

@@ -5,7 +5,7 @@ def test_load_en():
     rules = util.read_tokenization('en')
     assert len(rules) != 0
    aint = [rule for rule in rules if rule[0] == "ain't"][0]
-    chunk, lex, pieces = aint
+    chunk, pieces = aint
     assert chunk == "ain't"
-    assert lex == "are"
-    assert pieces == ["not"]
+    assert pieces[0] == "are"
+    assert pieces[1] == "not"

View File

@@ -1,7 +1,7 @@
 from __future__ import unicode_literals

-from spacy import lex_of, sic_of
-from spacy.spacy import expand_chunk
+from spacy import lex_of
+from spacy.en import tokenize
 from spacy.en import lookup
 from spacy.en import unhash
@@ -17,19 +17,18 @@ def test_token(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == open_
         assert unhash(lex_of(tokens[1])) == word_str
         assert unhash(lex_of(tokens[2])) == close_
-        assert unhash(sic_of(tokens[0])) == string


 def test_two_different(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 5
         assert unhash(lex_of(tokens[0])) == "`"
         assert unhash(lex_of(tokens[1])) == open_

View File

@@ -19,15 +19,12 @@ def test_two_words():

 def test_punct():
-    lex_ids = tokenize('hello, possums.')
-    assert len(lex_ids) == 4
-    assert lex_ids[0] != lookup('hello')
-    assert lex_of(lex_ids[0]) == lex_of(lookup('hello'))
-    assert lex_ids[2] == lookup('possums.')
-    assert lex_of(lex_ids[2]) == lex_of(lookup('possums.'))
-    assert lex_of(lex_ids[2]) == lex_of(lookup('possums'))
-    assert lex_of(lex_ids[1]) != lex_of(lookup('hello'))
-    assert lex_ids[0] != lookup('hello.')
+    tokens = tokenize('hello, possums.')
+    assert len(tokens) == 4
+    assert lex_of(tokens[0]) == lex_of(lookup('hello'))
+    assert lex_of(tokens[1]) == lex_of(lookup(','))
+    assert lex_of(tokens[2]) == lex_of(lookup('possums'))
+    assert lex_of(tokens[1]) != lex_of(lookup('hello'))


 def test_digits():