mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word.
This commit is contained in:
parent
b94c9b72c9
commit
01469b0888
21
setup.py
21
setup.py
|
@ -39,29 +39,20 @@ cython_includes = ['.']
|
||||||
|
|
||||||
if 'VIRTUAL_ENV' in os.environ:
|
if 'VIRTUAL_ENV' in os.environ:
|
||||||
includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'include', 'site', '*'))
|
includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'include', 'site', '*'))
|
||||||
cython_includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'lib', '*'))
|
|
||||||
else:
|
else:
|
||||||
# If you're not using virtualenv, set your include dir here.
|
# If you're not using virtualenv, set your include dir here.
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
exts = [
|
exts = [
|
||||||
|
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy.en", ["spacy/en.pyx"], language="c++",
|
Extension("spacy.en", ["spacy/en.pyx"], language="c++",
|
||||||
include_dirs=includes, cython_include_dirs=cython_includes),
|
include_dirs=includes),
|
||||||
Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes,
|
Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
|
||||||
cython_include_dirs=cython_includes),
|
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes,
|
Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
|
||||||
cython_include_dirs=cython_includes),
|
|
||||||
Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes,
|
|
||||||
cython_include_dirs=cython_includes),
|
|
||||||
Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes,
|
|
||||||
cython_include_dirs=cython_includes),
|
|
||||||
Extension("spacy.chartree", ["spacy/chartree.pyx"], language="c++", include_dirs=includes,
|
|
||||||
cython_include_dirs=cython_includes),
|
|
||||||
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes,
|
|
||||||
cython_include_dirs=cython_includes),
|
|
||||||
Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
|
Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
|
||||||
include_dirs=includes, cython_include_dirs=cython_includes),
|
include_dirs=includes),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,16 +1,14 @@
|
||||||
from .lexeme import lex_of
|
from .lexeme import lex_of
|
||||||
from .lexeme import sic_of
|
|
||||||
from .lexeme import length_of
|
from .lexeme import length_of
|
||||||
|
|
||||||
from .tokens import Tokens
|
from .tokens import Tokens
|
||||||
|
|
||||||
# Don't know how to get the enum Python visible :(
|
# Don't know how to get the enum Python visible :(
|
||||||
|
|
||||||
SIC = 0
|
LEX = 0
|
||||||
LEX = 1
|
NORM = 1
|
||||||
NORM = 2
|
SHAPE = 2
|
||||||
SHAPE = 3
|
LAST3 = 3
|
||||||
LAST3 = 4
|
LENGTH = 4
|
||||||
LENGTH = 5
|
|
||||||
|
|
||||||
__all__ = [Tokens, lex_of, sic_of, length_of, SIC, LEX, NORM, SHAPE, LAST3, LENGTH]
|
__all__ = [Tokens, lex_of, length_of, LEX, NORM, SHAPE, LAST3, LENGTH]
|
||||||
|
|
|
@ -9,7 +9,7 @@ from spacy.tokens cimport Tokens
|
||||||
|
|
||||||
|
|
||||||
cdef class English(spacy.Language):
|
cdef class English(spacy.Language):
|
||||||
cdef int find_split(self, unicode word, size_t length)
|
cdef int find_split(self, unicode word)
|
||||||
|
|
||||||
cdef English EN
|
cdef English EN
|
||||||
|
|
||||||
|
|
10
spacy/en.pyx
10
spacy/en.pyx
|
@ -17,10 +17,13 @@ cimport spacy
|
||||||
|
|
||||||
|
|
||||||
cdef class English(spacy.Language):
|
cdef class English(spacy.Language):
|
||||||
cdef int find_split(self, unicode word, size_t length):
|
cdef int find_split(self, unicode word):
|
||||||
|
cdef size_t length = len(word)
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
|
if word.startswith("'s") or word.startswith("'S"):
|
||||||
|
return 2
|
||||||
# Contractions
|
# Contractions
|
||||||
if word.endswith("'s"):
|
if word.endswith("'s") and length >= 3:
|
||||||
return length - 2
|
return length - 2
|
||||||
# Leading punctuation
|
# Leading punctuation
|
||||||
if is_punct(word, 0, length):
|
if is_punct(word, 0, length):
|
||||||
|
@ -36,7 +39,6 @@ cdef class English(spacy.Language):
|
||||||
cdef bint is_punct(unicode word, size_t i, size_t length):
|
cdef bint is_punct(unicode word, size_t i, size_t length):
|
||||||
# Don't count appostrophes as punct if the next char is a letter
|
# Don't count appostrophes as punct if the next char is a letter
|
||||||
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
|
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
|
||||||
# ...Unless we're at 0
|
|
||||||
return i == 0
|
return i == 0
|
||||||
if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
|
if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
|
||||||
return False
|
return False
|
||||||
|
@ -57,7 +59,7 @@ cpdef Tokens tokenize(unicode string):
|
||||||
|
|
||||||
|
|
||||||
cpdef Lexeme_addr lookup(unicode string) except 0:
|
cpdef Lexeme_addr lookup(unicode string) except 0:
|
||||||
return EN.lookup_chunk(string)
|
return <Lexeme_addr>EN.lookup(string)
|
||||||
|
|
||||||
|
|
||||||
cpdef unicode unhash(StringHash hash_value):
|
cpdef unicode unhash(StringHash hash_value):
|
||||||
|
|
|
@ -8,7 +8,7 @@ from spacy.tokens cimport Tokens
|
||||||
|
|
||||||
|
|
||||||
cdef class EnglishPTB(Language):
|
cdef class EnglishPTB(Language):
|
||||||
cdef int find_split(self, unicode word, size_t length)
|
cdef int find_split(self, unicode word)
|
||||||
|
|
||||||
|
|
||||||
cdef EnglishPTB EN_PTB
|
cdef EnglishPTB EN_PTB
|
||||||
|
|
|
@ -17,7 +17,8 @@ cimport spacy
|
||||||
|
|
||||||
|
|
||||||
cdef class EnglishPTB(Language):
|
cdef class EnglishPTB(Language):
|
||||||
cdef int find_split(self, unicode word, size_t length):
|
cdef int find_split(self, unicode word):
|
||||||
|
length = len(word)
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
# Contractions
|
# Contractions
|
||||||
if word.endswith("'s"):
|
if word.endswith("'s"):
|
||||||
|
@ -53,7 +54,7 @@ cpdef Tokens tokenize(unicode string):
|
||||||
|
|
||||||
|
|
||||||
cpdef Lexeme_addr lookup(unicode string) except 0:
|
cpdef Lexeme_addr lookup(unicode string) except 0:
|
||||||
return EN_PTB.lookup_chunk(string)
|
return <Lexeme_addr>EN_PTB.lookup_chunk(string)
|
||||||
|
|
||||||
|
|
||||||
cpdef unicode unhash(StringHash hash_value):
|
cpdef unicode unhash(StringHash hash_value):
|
||||||
|
|
|
@ -32,14 +32,13 @@ cdef struct Lexeme:
|
||||||
|
|
||||||
Distribution* dist # Distribution info, lazy loaded
|
Distribution* dist # Distribution info, lazy loaded
|
||||||
Orthography* orth # Extra orthographic views
|
Orthography* orth # Extra orthographic views
|
||||||
Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
|
#Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
|
||||||
|
|
||||||
|
|
||||||
cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
|
cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
|
||||||
|
|
||||||
|
|
||||||
cdef enum StringAttr:
|
cdef enum StringAttr:
|
||||||
SIC
|
|
||||||
LEX
|
LEX
|
||||||
NORM
|
NORM
|
||||||
SHAPE
|
SHAPE
|
||||||
|
@ -49,7 +48,6 @@ cdef enum StringAttr:
|
||||||
|
|
||||||
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
|
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
|
||||||
|
|
||||||
cpdef StringHash sic_of(size_t lex_id) except 0
|
|
||||||
cpdef StringHash lex_of(size_t lex_id) except 0
|
cpdef StringHash lex_of(size_t lex_id) except 0
|
||||||
cpdef StringHash norm_of(size_t lex_id) except 0
|
cpdef StringHash norm_of(size_t lex_id) except 0
|
||||||
cpdef StringHash shape_of(size_t lex_id) except 0
|
cpdef StringHash shape_of(size_t lex_id) except 0
|
||||||
|
|
|
@ -22,9 +22,7 @@ from spacy.spacy cimport StringHash
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
|
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
|
||||||
if attr == SIC:
|
if attr == LEX:
|
||||||
return sic_of(lex_id)
|
|
||||||
elif attr == LEX:
|
|
||||||
return lex_of(lex_id)
|
return lex_of(lex_id)
|
||||||
elif attr == NORM:
|
elif attr == NORM:
|
||||||
return norm_of(lex_id)
|
return norm_of(lex_id)
|
||||||
|
@ -38,18 +36,6 @@ cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
|
||||||
raise StandardError
|
raise StandardError
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash sic_of(size_t lex_id) except 0:
|
|
||||||
'''Access the `sic' field of the Lexeme pointed to by lex_id.
|
|
||||||
|
|
||||||
The sic field stores the hash of the whitespace-delimited string-chunk used to
|
|
||||||
construct the Lexeme.
|
|
||||||
|
|
||||||
>>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]
|
|
||||||
[u'Hi!', u'', u'world]
|
|
||||||
'''
|
|
||||||
return (<Lexeme*>lex_id).sic
|
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash lex_of(size_t lex_id) except 0:
|
cpdef StringHash lex_of(size_t lex_id) except 0:
|
||||||
'''Access the `lex' field of the Lexeme pointed to by lex_id.
|
'''Access the `lex' field of the Lexeme pointed to by lex_id.
|
||||||
|
|
||||||
|
|
|
@ -3,8 +3,6 @@ from libc.stdint cimport uint32_t
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
from sparsehash.dense_hash_map cimport dense_hash_map
|
from sparsehash.dense_hash_map cimport dense_hash_map
|
||||||
from _hashing cimport FixedTable
|
|
||||||
from _hashing cimport WordTree
|
|
||||||
|
|
||||||
# Circular import problems here
|
# Circular import problems here
|
||||||
ctypedef size_t Lexeme_addr
|
ctypedef size_t Lexeme_addr
|
||||||
|
@ -28,22 +26,21 @@ from spacy._hashing cimport WordTree
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
cdef object name
|
cdef object name
|
||||||
cdef WordTree vocab
|
cdef dense_hash_map[StringHash, size_t] chunks
|
||||||
cdef WordTree distri
|
cdef dense_hash_map[StringHash, size_t] vocab
|
||||||
cdef WordTree ortho
|
|
||||||
cdef dict bacov
|
cdef dict bacov
|
||||||
|
|
||||||
cpdef Tokens tokenize(self, unicode text)
|
cdef Tokens tokenize(self, unicode text)
|
||||||
|
|
||||||
cdef Lexeme_addr lookup(self, unicode string) except 0
|
cdef Lexeme* lookup(self, unicode string) except NULL
|
||||||
cdef Lexeme_addr lookup_chunk(self, unicode string) except 0
|
cdef Lexeme** lookup_chunk(self, unicode string) except NULL
|
||||||
cdef Orthography* lookup_orth(self, unicode lex) except NULL
|
|
||||||
cdef Distribution* lookup_dist(self, unicode lex) except NULL
|
|
||||||
|
|
||||||
cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL
|
cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
|
||||||
|
cdef Lexeme* new_lexeme(self, unicode lex) except NULL
|
||||||
cdef Orthography* new_orth(self, unicode lex) except NULL
|
cdef Orthography* new_orth(self, unicode lex) except NULL
|
||||||
cdef Distribution* new_dist(self, unicode lex) except NULL
|
cdef Distribution* new_dist(self, unicode lex) except NULL
|
||||||
|
|
||||||
cdef unicode unhash(self, StringHash hashed)
|
cdef unicode unhash(self, StringHash hashed)
|
||||||
|
|
||||||
cdef int find_split(self, unicode word, size_t length)
|
cpdef list find_substrings(self, unicode word)
|
||||||
|
cdef int find_split(self, unicode word)
|
||||||
|
|
152
spacy/spacy.pyx
152
spacy/spacy.pyx
|
@ -5,7 +5,6 @@ from libc.stdlib cimport calloc, free
|
||||||
from libcpp.pair cimport pair
|
from libcpp.pair cimport pair
|
||||||
from cython.operator cimport dereference as deref
|
from cython.operator cimport dereference as deref
|
||||||
|
|
||||||
from murmurhash cimport mrmr
|
|
||||||
from spacy.lexeme cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
from spacy.lexeme cimport BLANK_WORD
|
from spacy.lexeme cimport BLANK_WORD
|
||||||
|
|
||||||
|
@ -64,86 +63,56 @@ cdef class Language:
|
||||||
def __cinit__(self, name):
|
def __cinit__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.bacov = {}
|
self.bacov = {}
|
||||||
self.vocab = WordTree(0, 5)
|
self.chunks = dense_hash_map[StringHash, size_t]()
|
||||||
self.ortho = WordTree(0, 5)
|
self.vocab = dense_hash_map[StringHash, size_t]()
|
||||||
self.distri = WordTree(0, 5)
|
self.chunks.set_empty_key(0)
|
||||||
|
self.vocab.set_empty_key(0)
|
||||||
self.load_tokenization(util.read_tokenization(name))
|
self.load_tokenization(util.read_tokenization(name))
|
||||||
|
|
||||||
cpdef Tokens tokenize(self, unicode characters):
|
cdef Tokens tokenize(self, unicode characters):
|
||||||
cdef size_t i = 0
|
cdef size_t i = 0
|
||||||
cdef size_t start = 0
|
cdef size_t start = 0
|
||||||
|
cdef Lexeme** chunk
|
||||||
cdef Tokens tokens = Tokens(self)
|
cdef Tokens tokens = Tokens(self)
|
||||||
cdef Lexeme* token
|
for chunk_str in characters.split():
|
||||||
for c in characters:
|
chunk = self.lookup_chunk(chunk_str)
|
||||||
if _is_whitespace(c):
|
i = 0
|
||||||
if start < i:
|
while chunk[i] != NULL:
|
||||||
token = <Lexeme*>self.lookup_chunk(characters[start:i])
|
tokens.append(<Lexeme_addr>chunk[i])
|
||||||
while token != NULL:
|
i += 1
|
||||||
tokens.append(<Lexeme_addr>token)
|
|
||||||
token = token.tail
|
|
||||||
start = i + 1
|
|
||||||
i += 1
|
|
||||||
if start < i:
|
|
||||||
token = <Lexeme*>self.lookup_chunk(characters[start:])
|
|
||||||
while token != NULL:
|
|
||||||
tokens.append(<Lexeme_addr>token)
|
|
||||||
token = token.tail
|
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef Lexeme_addr lookup(self, unicode string) except 0:
|
cdef Lexeme* lookup(self, unicode string) except NULL:
|
||||||
cdef size_t length = len(string)
|
if len(string) == 0:
|
||||||
if length == 0:
|
return &BLANK_WORD
|
||||||
return <Lexeme_addr>&BLANK_WORD
|
cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
|
||||||
|
if word == NULL:
|
||||||
|
word = self.new_lexeme(string)
|
||||||
|
return word
|
||||||
|
|
||||||
cdef StringHash hashed = hash(string)
|
cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
|
||||||
# First, check words seen 2+ times
|
assert len(string) != 0
|
||||||
cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
|
cdef Lexeme** chunk = <Lexeme**>self.chunks[hash(string)]
|
||||||
if word_ptr == NULL:
|
|
||||||
word_ptr = self.new_lexeme(string, string)
|
|
||||||
return <Lexeme_addr>word_ptr
|
|
||||||
|
|
||||||
cdef Lexeme_addr lookup_chunk(self, unicode string) except 0:
|
|
||||||
'''Fetch a Lexeme representing a word string. If the word has not been seen,
|
|
||||||
construct one, splitting off any attached punctuation or clitics. A
|
|
||||||
reference to BLANK_WORD is returned for the empty string.
|
|
||||||
'''
|
|
||||||
cdef size_t length = len(string)
|
|
||||||
if length == 0:
|
|
||||||
return <Lexeme_addr>&BLANK_WORD
|
|
||||||
# First, check words seen 2+ times
|
|
||||||
cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
|
|
||||||
cdef int split
|
cdef int split
|
||||||
if word_ptr == NULL:
|
if chunk == NULL:
|
||||||
split = self.find_split(string, length)
|
chunk = self.new_chunk(string, self.find_substrings(string))
|
||||||
if split != 0 and split != -1 and split < length:
|
return chunk
|
||||||
word_ptr = self.new_lexeme(string, string[:split])
|
|
||||||
word_ptr.tail = <Lexeme*>self.lookup_chunk(string[split:])
|
|
||||||
else:
|
|
||||||
word_ptr = self.new_lexeme(string, string)
|
|
||||||
return <Lexeme_addr>word_ptr
|
|
||||||
|
|
||||||
cdef Orthography* lookup_orth(self, unicode lex):
|
cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
|
||||||
cdef Orthography* orth = <Orthography*>self.ortho.get(lex)
|
cdef Lexeme** chunk = <Lexeme**>calloc(len(substrings) + 1, sizeof(Lexeme*))
|
||||||
if orth == NULL:
|
for i, substring in enumerate(substrings):
|
||||||
orth = self.new_orth(lex)
|
chunk[i] = self.lookup(substring)
|
||||||
return orth
|
chunk[i + 1] = NULL
|
||||||
|
self.chunks[hash(string)] = <size_t>chunk
|
||||||
|
return chunk
|
||||||
|
|
||||||
cdef Distribution* lookup_dist(self, unicode lex):
|
cdef Lexeme* new_lexeme(self, unicode string) except NULL:
|
||||||
cdef Distribution* dist = <Distribution*>self.distri.get(lex)
|
|
||||||
if dist == NULL:
|
|
||||||
dist = self.new_dist(lex)
|
|
||||||
return dist
|
|
||||||
|
|
||||||
cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL:
|
|
||||||
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
|
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
|
||||||
word.sic = hash(key)
|
|
||||||
word.lex = hash(string)
|
word.lex = hash(string)
|
||||||
self.bacov[word.lex] = string
|
self.bacov[word.lex] = string
|
||||||
self.bacov[word.sic] = key
|
word.orth = self.new_orth(string)
|
||||||
word.orth = self.lookup_orth(string)
|
word.dist = self.new_dist(string)
|
||||||
word.dist = self.lookup_dist(string)
|
self.vocab[word.lex] = <size_t>word
|
||||||
self.vocab.set(key, <size_t>word)
|
|
||||||
return word
|
return word
|
||||||
|
|
||||||
cdef Orthography* new_orth(self, unicode lex) except NULL:
|
cdef Orthography* new_orth(self, unicode lex) except NULL:
|
||||||
|
@ -170,30 +139,33 @@ cdef class Language:
|
||||||
self.bacov[orth.norm] = norm
|
self.bacov[orth.norm] = norm
|
||||||
self.bacov[orth.shape] = shape
|
self.bacov[orth.shape] = shape
|
||||||
|
|
||||||
self.ortho.set(lex, <size_t>orth)
|
|
||||||
return orth
|
return orth
|
||||||
|
|
||||||
cdef Distribution* new_dist(self, unicode lex) except NULL:
|
cdef Distribution* new_dist(self, unicode lex) except NULL:
|
||||||
dist = <Distribution*>calloc(1, sizeof(Distribution))
|
dist = <Distribution*>calloc(1, sizeof(Distribution))
|
||||||
self.distri.set(lex, <size_t>dist)
|
|
||||||
return dist
|
return dist
|
||||||
|
|
||||||
cdef unicode unhash(self, StringHash hash_value):
|
cdef unicode unhash(self, StringHash hash_value):
|
||||||
'''Fetch a string from the reverse index, given its hash value.'''
|
'''Fetch a string from the reverse index, given its hash value.'''
|
||||||
return self.bacov[hash_value]
|
return self.bacov[hash_value]
|
||||||
|
|
||||||
cdef int find_split(self, unicode word, size_t length):
|
cpdef list find_substrings(self, unicode word):
|
||||||
return -1
|
substrings = []
|
||||||
|
while word:
|
||||||
|
split = self.find_split(word)
|
||||||
|
if split == 0:
|
||||||
|
substrings.append(word)
|
||||||
|
break
|
||||||
|
substrings.append(word[:split])
|
||||||
|
word = word[split:]
|
||||||
|
return substrings
|
||||||
|
|
||||||
|
cdef int find_split(self, unicode word):
|
||||||
|
return len(word)
|
||||||
|
|
||||||
def load_tokenization(self, token_rules=None):
|
def load_tokenization(self, token_rules=None):
|
||||||
cdef Lexeme* word
|
for chunk, tokens in token_rules:
|
||||||
cdef StringHash hashed
|
self.new_chunk(chunk, tokens)
|
||||||
for chunk, lex, tokens in token_rules:
|
|
||||||
word = <Lexeme*>self.new_lexeme(chunk, lex)
|
|
||||||
for i, lex in enumerate(tokens):
|
|
||||||
token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
|
|
||||||
word.tail = <Lexeme*>self.new_lexeme(token_string, lex)
|
|
||||||
word = word.tail
|
|
||||||
|
|
||||||
def load_clusters(self):
|
def load_clusters(self):
|
||||||
cdef Lexeme* w
|
cdef Lexeme* w
|
||||||
|
@ -209,24 +181,4 @@ cdef class Language:
|
||||||
# the first 4 bits. See redshift._parse_features.pyx
|
# the first 4 bits. See redshift._parse_features.pyx
|
||||||
cluster = int(cluster_str[::-1], 2)
|
cluster = int(cluster_str[::-1], 2)
|
||||||
upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
|
upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
|
||||||
word = self.new_lexeme(token_string, token_string)
|
self.new_lexeme(token_string)
|
||||||
|
|
||||||
|
|
||||||
cdef inline bint _is_whitespace(unsigned char c) nogil:
|
|
||||||
if c == b' ':
|
|
||||||
return True
|
|
||||||
elif c == b'\n':
|
|
||||||
return True
|
|
||||||
elif c == b'\t':
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
cpdef vector[size_t] expand_chunk(size_t addr) except *:
|
|
||||||
cdef vector[size_t] tokens = vector[size_t]()
|
|
||||||
word = <Lexeme*>addr
|
|
||||||
while word != NULL:
|
|
||||||
tokens.push_back(<size_t>word)
|
|
||||||
word = word.tail
|
|
||||||
return tokens
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from spacy.spacy cimport Lexeme_addr
|
from spacy.spacy cimport Lexeme_addr
|
||||||
|
from spacy.lexeme cimport Lexeme
|
||||||
|
|
||||||
from cython.operator cimport dereference as deref
|
from cython.operator cimport dereference as deref
|
||||||
from spacy.spacy cimport Language
|
from spacy.spacy cimport Language
|
||||||
|
|
|
@ -32,13 +32,12 @@ def read_tokenization(lang):
|
||||||
continue
|
continue
|
||||||
pieces = line.split()
|
pieces = line.split()
|
||||||
chunk = pieces.pop(0)
|
chunk = pieces.pop(0)
|
||||||
lex = pieces.pop(0)
|
|
||||||
assert chunk not in seen, chunk
|
assert chunk not in seen, chunk
|
||||||
seen.add(chunk)
|
seen.add(chunk)
|
||||||
entries.append((chunk, lex, pieces))
|
entries.append((chunk, list(pieces)))
|
||||||
if chunk[0].isalpha() and chunk[0].islower():
|
if chunk[0].isalpha() and chunk[0].islower():
|
||||||
chunk = chunk[0].title() + chunk[1:]
|
chunk = chunk[0].title() + chunk[1:]
|
||||||
lex = lex[0].title() + lex[1:]
|
pieces[0] = pieces[0][0].title() + pieces[0][1:]
|
||||||
seen.add(chunk)
|
seen.add(chunk)
|
||||||
entries.append((chunk, lex, pieces))
|
entries.append((chunk, pieces))
|
||||||
return entries
|
return entries
|
||||||
|
|
|
@ -1,44 +1,43 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy.spacy import expand_chunk
|
from spacy.en import tokenize, lookup, unhash
|
||||||
from spacy.en import lookup, unhash
|
|
||||||
|
|
||||||
from spacy import lex_of
|
from spacy import lex_of
|
||||||
|
|
||||||
|
|
||||||
def test_possess():
|
def test_possess():
|
||||||
tokens = expand_chunk(lookup("Mike's"))
|
tokens = tokenize("Mike's")
|
||||||
assert len(tokens) == 2
|
|
||||||
assert unhash(lex_of(tokens[0])) == "Mike"
|
assert unhash(lex_of(tokens[0])) == "Mike"
|
||||||
assert unhash(lex_of(tokens[1])) == "'s"
|
assert unhash(lex_of(tokens[1])) == "'s"
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
def test_apostrophe():
|
def test_apostrophe():
|
||||||
tokens = expand_chunk(lookup("schools'"))
|
tokens = tokenize("schools'")
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert unhash(lex_of(tokens[1])) == "'"
|
assert unhash(lex_of(tokens[1])) == "'"
|
||||||
assert unhash(lex_of(tokens[0])) == "schools"
|
assert unhash(lex_of(tokens[0])) == "schools"
|
||||||
|
|
||||||
|
|
||||||
def test_LL():
|
def test_LL():
|
||||||
tokens = expand_chunk(lookup("we'll"))
|
tokens = tokenize("we'll")
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert unhash(lex_of(tokens[1])) == "will"
|
assert unhash(lex_of(tokens[1])) == "will"
|
||||||
assert unhash(lex_of(tokens[0])) == "we"
|
assert unhash(lex_of(tokens[0])) == "we"
|
||||||
|
|
||||||
|
|
||||||
def test_aint():
|
def test_aint():
|
||||||
tokens = expand_chunk(lookup("ain't"))
|
tokens = tokenize("ain't")
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert unhash(lex_of(tokens[0])) == "are"
|
assert unhash(lex_of(tokens[0])) == "are"
|
||||||
assert unhash(lex_of(tokens[1])) == "not"
|
assert unhash(lex_of(tokens[1])) == "not"
|
||||||
|
|
||||||
|
|
||||||
def test_capitalized():
|
def test_capitalized():
|
||||||
tokens = expand_chunk(lookup("can't"))
|
tokens = tokenize("can't")
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
tokens = expand_chunk(lookup("Can't"))
|
tokens = tokenize("Can't")
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
tokens = expand_chunk(lookup("Ain't"))
|
tokens = tokenize("Ain't")
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert unhash(lex_of(tokens[0])) == "Are"
|
assert unhash(lex_of(tokens[0])) == "Are"
|
||||||
|
|
|
@ -5,7 +5,7 @@ import pytest
|
||||||
from spacy import en
|
from spacy import en
|
||||||
from spacy.lexeme import lex_of
|
from spacy.lexeme import lex_of
|
||||||
|
|
||||||
from spacy import SIC, LEX, NORM, SHAPE, LAST3
|
from spacy import LEX, NORM, SHAPE, LAST3
|
||||||
|
|
||||||
|
|
||||||
def test_group_by_lex():
|
def test_group_by_lex():
|
||||||
|
|
|
@ -4,7 +4,7 @@ import pytest
|
||||||
|
|
||||||
from spacy.en import lookup, unhash
|
from spacy.en import lookup, unhash
|
||||||
|
|
||||||
from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of, length_of
|
from spacy.lexeme import lex_of, norm_of, shape_of, first_of, length_of
|
||||||
from spacy.lexeme import shape_of
|
from spacy.lexeme import shape_of
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy import lex_of
|
from spacy import lex_of
|
||||||
from spacy.spacy import expand_chunk
|
|
||||||
from spacy.en import lookup
|
from spacy.en import lookup
|
||||||
|
from spacy.en import tokenize
|
||||||
from spacy.en import unhash
|
from spacy.en import unhash
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
@ -17,8 +17,7 @@ def test_close(close_puncts):
|
||||||
word_str = 'Hello'
|
word_str = 'Hello'
|
||||||
for p in close_puncts:
|
for p in close_puncts:
|
||||||
string = word_str + p
|
string = word_str + p
|
||||||
token = lookup(string)
|
tokens = tokenize(string)
|
||||||
tokens = expand_chunk(token)
|
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert unhash(lex_of(tokens[1])) == p
|
assert unhash(lex_of(tokens[1])) == p
|
||||||
assert unhash(lex_of(tokens[0])) == word_str
|
assert unhash(lex_of(tokens[0])) == word_str
|
||||||
|
@ -28,9 +27,7 @@ def test_two_different_close(close_puncts):
|
||||||
word_str = 'Hello'
|
word_str = 'Hello'
|
||||||
for p in close_puncts:
|
for p in close_puncts:
|
||||||
string = word_str + p + "'"
|
string = word_str + p + "'"
|
||||||
token = lookup(string)
|
tokens = tokenize(string)
|
||||||
assert unhash(lex_of(token)) == word_str
|
|
||||||
tokens = expand_chunk(token)
|
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
assert unhash(lex_of(tokens[0])) == word_str
|
assert unhash(lex_of(tokens[0])) == word_str
|
||||||
assert unhash(lex_of(tokens[1])) == p
|
assert unhash(lex_of(tokens[1])) == p
|
||||||
|
@ -41,7 +38,7 @@ def test_three_same_close(close_puncts):
|
||||||
word_str = 'Hello'
|
word_str = 'Hello'
|
||||||
for p in close_puncts:
|
for p in close_puncts:
|
||||||
string = word_str + p + p + p
|
string = word_str + p + p + p
|
||||||
tokens = expand_chunk(lookup(string))
|
tokens = tokenize(string)
|
||||||
assert len(tokens) == 4
|
assert len(tokens) == 4
|
||||||
assert unhash(lex_of(tokens[0])) == word_str
|
assert unhash(lex_of(tokens[0])) == word_str
|
||||||
assert unhash(lex_of(tokens[1])) == p
|
assert unhash(lex_of(tokens[1])) == p
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy import lex_of
|
from spacy import lex_of
|
||||||
from spacy.spacy import expand_chunk
|
|
||||||
from spacy.en import lookup
|
from spacy.en import lookup
|
||||||
|
from spacy.en import tokenize
|
||||||
from spacy.en import unhash
|
from spacy.en import unhash
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
@ -17,9 +17,7 @@ def test_open(open_puncts):
|
||||||
word_str = 'Hello'
|
word_str = 'Hello'
|
||||||
for p in open_puncts:
|
for p in open_puncts:
|
||||||
string = p + word_str
|
string = p + word_str
|
||||||
token = lookup(string)
|
tokens = tokenize(string)
|
||||||
assert unhash(lex_of(token)) == p
|
|
||||||
tokens = expand_chunk(token)
|
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert unhash(lex_of(tokens[0])) == p
|
assert unhash(lex_of(tokens[0])) == p
|
||||||
assert unhash(lex_of(tokens[1])) == word_str
|
assert unhash(lex_of(tokens[1])) == word_str
|
||||||
|
@ -29,9 +27,7 @@ def test_two_different_open(open_puncts):
|
||||||
word_str = 'Hello'
|
word_str = 'Hello'
|
||||||
for p in open_puncts:
|
for p in open_puncts:
|
||||||
string = p + "`" + word_str
|
string = p + "`" + word_str
|
||||||
token = lookup(string)
|
tokens = tokenize(string)
|
||||||
assert unhash(lex_of(token)) == p
|
|
||||||
tokens = expand_chunk(token)
|
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
assert unhash(lex_of(tokens[0])) == p
|
assert unhash(lex_of(tokens[0])) == p
|
||||||
assert unhash(lex_of(tokens[1])) == "`"
|
assert unhash(lex_of(tokens[1])) == "`"
|
||||||
|
@ -42,9 +38,7 @@ def test_three_same_open(open_puncts):
|
||||||
word_str = 'Hello'
|
word_str = 'Hello'
|
||||||
for p in open_puncts:
|
for p in open_puncts:
|
||||||
string = p + p + p + word_str
|
string = p + p + p + word_str
|
||||||
token = lookup(string)
|
tokens = tokenize(string)
|
||||||
assert unhash(lex_of(token)) == p
|
|
||||||
tokens = expand_chunk(token)
|
|
||||||
assert len(tokens) == 4
|
assert len(tokens) == 4
|
||||||
assert unhash(lex_of(tokens[0])) == p
|
assert unhash(lex_of(tokens[0])) == p
|
||||||
assert unhash(lex_of(tokens[3])) == word_str
|
assert unhash(lex_of(tokens[3])) == word_str
|
||||||
|
@ -52,6 +46,6 @@ def test_three_same_open(open_puncts):
|
||||||
|
|
||||||
def test_open_appostrophe():
|
def test_open_appostrophe():
|
||||||
string = "'The"
|
string = "'The"
|
||||||
tokens = expand_chunk(lookup(string))
|
tokens = tokenize(string)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert unhash(lex_of(tokens[0])) == "'"
|
assert unhash(lex_of(tokens[0])) == "'"
|
||||||
|
|
|
@ -5,7 +5,7 @@ def test_load_en():
|
||||||
rules = util.read_tokenization('en')
|
rules = util.read_tokenization('en')
|
||||||
assert len(rules) != 0
|
assert len(rules) != 0
|
||||||
aint = [rule for rule in rules if rule[0] == "ain't"][0]
|
aint = [rule for rule in rules if rule[0] == "ain't"][0]
|
||||||
chunk, lex, pieces = aint
|
chunk, pieces = aint
|
||||||
assert chunk == "ain't"
|
assert chunk == "ain't"
|
||||||
assert lex == "are"
|
assert pieces[0] == "are"
|
||||||
assert pieces == ["not"]
|
assert pieces[1] == "not"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy import lex_of, sic_of
|
from spacy import lex_of
|
||||||
from spacy.spacy import expand_chunk
|
from spacy.en import tokenize
|
||||||
from spacy.en import lookup
|
from spacy.en import lookup
|
||||||
from spacy.en import unhash
|
from spacy.en import unhash
|
||||||
|
|
||||||
|
@ -17,19 +17,18 @@ def test_token(paired_puncts):
|
||||||
word_str = 'Hello'
|
word_str = 'Hello'
|
||||||
for open_, close_ in paired_puncts:
|
for open_, close_ in paired_puncts:
|
||||||
string = open_ + word_str + close_
|
string = open_ + word_str + close_
|
||||||
tokens = expand_chunk(lookup(string))
|
tokens = tokenize(string)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
assert unhash(lex_of(tokens[0])) == open_
|
assert unhash(lex_of(tokens[0])) == open_
|
||||||
assert unhash(lex_of(tokens[1])) == word_str
|
assert unhash(lex_of(tokens[1])) == word_str
|
||||||
assert unhash(lex_of(tokens[2])) == close_
|
assert unhash(lex_of(tokens[2])) == close_
|
||||||
assert unhash(sic_of(tokens[0])) == string
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_different(paired_puncts):
|
def test_two_different(paired_puncts):
|
||||||
word_str = 'Hello'
|
word_str = 'Hello'
|
||||||
for open_, close_ in paired_puncts:
|
for open_, close_ in paired_puncts:
|
||||||
string = "`" + open_ + word_str + close_ + "'"
|
string = "`" + open_ + word_str + close_ + "'"
|
||||||
tokens = expand_chunk(lookup(string))
|
tokens = tokenize(string)
|
||||||
assert len(tokens) == 5
|
assert len(tokens) == 5
|
||||||
assert unhash(lex_of(tokens[0])) == "`"
|
assert unhash(lex_of(tokens[0])) == "`"
|
||||||
assert unhash(lex_of(tokens[1])) == open_
|
assert unhash(lex_of(tokens[1])) == open_
|
||||||
|
|
|
@ -19,15 +19,12 @@ def test_two_words():
|
||||||
|
|
||||||
|
|
||||||
def test_punct():
|
def test_punct():
|
||||||
lex_ids = tokenize('hello, possums.')
|
tokens = tokenize('hello, possums.')
|
||||||
assert len(lex_ids) == 4
|
assert len(tokens) == 4
|
||||||
assert lex_ids[0] != lookup('hello')
|
assert lex_of(tokens[0]) == lex_of(lookup('hello'))
|
||||||
assert lex_of(lex_ids[0]) == lex_of(lookup('hello'))
|
assert lex_of(tokens[1]) == lex_of(lookup(','))
|
||||||
assert lex_ids[2] == lookup('possums.')
|
assert lex_of(tokens[2]) == lex_of(lookup('possums'))
|
||||||
assert lex_of(lex_ids[2]) == lex_of(lookup('possums.'))
|
assert lex_of(tokens[1]) != lex_of(lookup('hello'))
|
||||||
assert lex_of(lex_ids[2]) == lex_of(lookup('possums'))
|
|
||||||
assert lex_of(lex_ids[1]) != lex_of(lookup('hello'))
|
|
||||||
assert lex_ids[0] != lookup('hello.')
|
|
||||||
|
|
||||||
|
|
||||||
def test_digits():
|
def test_digits():
|
||||||
|
|
Loading…
Reference in New Issue
Block a user