* Broken version being refactored for docs

Matthew Honnibal 2014-08-20 13:39:39 +02:00
parent 5fddb8d165
commit a78ad4152d
8 changed files with 196 additions and 224 deletions

View File

@@ -10,6 +10,7 @@ from spacy.tokens cimport Tokens
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1

 cdef English EN

View File

@@ -1,7 +1,8 @@
 # cython: profile=True
-'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
-so that strings can be retrieved from hashes. Use 64-bit hash values and
-boldly assume no collisions.
+# cython: embedsignature=True
+'''Tokenize English text, allowing some differences from the Penn Treebank
+tokenization, e.g. for email addresses, URLs, etc. Use en_ptb if full PTB
+compatibility is the priority.
 '''
 from __future__ import unicode_literals
@@ -9,14 +10,17 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

 from spacy.string_tools cimport substr

 from . import util

 cimport spacy

+from spacy.orthography.latin cimport *


 cdef class English(spacy.Language):
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
+        pass
+
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
@@ -26,17 +30,17 @@ cdef class English(spacy.Language):
         if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
-        if is_punct(word, 0, length):
+        if check_punct(word, 0, length):
             return 1
         elif length >= 1:
             # Split off all trailing punctuation characters
             i = 0
-            while i < length and not is_punct(word, i, length):
+            while i < length and not check_punct(word, i, length):
                 i += 1
             return i


-cdef bint is_punct(unicode word, size_t i, size_t length):
+cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
         return i == 0
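
A pure-Python sketch of the same splitting policy, to make the control flow easier to follow; it mirrors the hunk above, and the parts of check_punct that fall outside the hunk are approximated:

    def find_split(word):
        length = len(word)
        i = 0
        if word.endswith("'s") and length >= 3:
            return length - 2                     # split off possessive 's
        if check_punct(word, 0, length):
            return 1                              # peel one leading punctuation character
        elif length >= 1:
            while i < length and not check_punct(word, i, length):
                i += 1                            # stop at trailing punctuation
        return i

    def check_punct(word, i, length):
        # Apostrophes followed by a letter only count as punctuation at position 0.
        if word[i] == "'" and i < (length - 1) and word[i + 1].isalpha():
            return i == 0
        # The rest of check_punct is not visible in this hunk; a plausible stand-in:
        return not word[i].isalnum()

    assert find_split(u"Mike's") == 4             # -> u"Mike" + u"'s"
    assert find_split(u"world!") == 5             # trailing "!" split off
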
@@ -55,14 +59,52 @@ EN = English('en')

 cpdef Tokens tokenize(unicode string):
+    """Tokenize a string.
+
+    Wraps EN.tokenize, where EN is an instance of the class English. The global
+    variable manages the vocabulary, and memoizes tokenization rules.
+
+    Args:
+        string (unicode): The string to be split. Must be unicode, not bytes.
+
+    Returns:
+        tokens (Tokens): A Tokens instance, managing a vector of pointers to
+            Lexeme structs. The Tokens instance supports sequence interfaces,
+            but also offers a range of sequence-level operations, which are computed
+            efficiently in Cython-space.
+    """
     return EN.tokenize(string)


 cpdef Lexeme_addr lookup(unicode string) except 0:
+    """Retrieve (or create) a Lexeme for a string.
+
+    Returns a Lexeme ID, which can be used via the accessor
+    methods in spacy.lexeme
+
+    Args:
+        string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+    Returns:
+        LexemeID (size_t): An unsigned integer that allows the Lexeme to be retrieved.
+            The LexemeID is really a memory address, making dereferencing it essentially
+            free.
+    """
     return <Lexeme_addr>EN.lookup(string)


 cpdef unicode unhash(StringHash hash_value):
+    """Retrieve a string from a hash value. Mostly used for testing.
+
+    In general you should avoid computing with strings, as they are slower than
+    the intended ID-based usage. However, strings can be recovered if necessary,
+    although no control is taken for hash collisions.
+
+    Args:
+        hash_value (uint32_t): The hash of a string, returned by Python's hash()
+            function.
+
+    Returns:
+        string (unicode): A unicode string that hashes to the hash_value.
+    """
     return EN.unhash(hash_value)
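
Taken together, these wrappers give a small Python-level API. A hedged usage sketch, assuming the compiled modules import as spacy.en and spacy.lexeme at this commit (the commit message warns the build is broken, so this shows intent rather than verified output):

    from spacy import en
    from spacy.lexeme import lex_of

    tokens = en.tokenize(u"Hi! world")            # Tokens over Lexeme* addresses
    lex_id = en.lookup(u"Hi")                     # size_t address of the Lexeme struct
    assert en.unhash(lex_of(lex_id)) == u"Hi"     # reverse index recovers the string
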

View File

@@ -1,83 +1,34 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t

-# Put these above import to avoid circular import problem
 ctypedef int ClusterID
 ctypedef uint32_t StringHash
-ctypedef size_t Lexeme_addr
-ctypedef char Bits8
-ctypedef uint64_t Bits64
+ctypedef size_t LexID
+ctypedef char OrthFlags
+ctypedef char DistFlags
+ctypedef uint64_t TagFlags


-cdef enum OrthFlag:
-    IS_ALPHA
-    IS_DIGIT
-    IS_PUNCT
-    IS_WHITE
-    IS_LOWER
-    IS_UPPER
-    IS_TITLE
-    IS_ASCII


-cdef enum DistFlag:
-    OFT_UPPER
-    OFT_TITLE
-    DIST_FLAG3
-    DIST_FLAG4
-    DIST_FLAG5
-    DIST_FLAG6
-    DIST_FLAG7
-    DIST_FLAG8


-cdef struct Orthography:
-    StringHash shape
-    StringHash norm
-    StringHash last3
-    Bits8 flags


-cdef struct Distribution:
-    double prob
-    ClusterID cluster
-    Bits64 tagdict
-    Bits8 flags


 cdef struct Lexeme:
+    StringHash lex
     char* string
     size_t length
-    StringHash lex
-    Orthography orth     # Extra orthographic views
-    Distribution dist    # Distribution info
+    double prob
+    ClusterID cluster
+    TagFlags possible_tags
+    DistFlags dist_flags
+    OrthFlags orth_flags
+    StringHash* string_views


-cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,
-                                Orthography(0, 0, 0, 0),
-                                Distribution(0.0, 0, 0, 0)
-                               )
+cpdef char first_of(LexID lex_id) except 0
+cpdef size_t length_of(LexID lex_id) except 0
+cpdef double prob_of(LexID lex_id) except 0
+cpdef ClusterID cluster_of(LexID lex_id) except 0

+cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
+cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
+cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *

-cdef enum StringAttr:
-    LEX
-    NORM
-    SHAPE
-    LAST3
-    LENGTH
+cpdef StringHash view_of(LexID lex_id, size_t view) except 0


-cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
-cpdef StringHash lex_of(size_t lex_id) except 0
-cpdef StringHash norm_of(size_t lex_id) except 0
-cpdef StringHash shape_of(size_t lex_id) except 0
-cpdef StringHash last3_of(size_t lex_id) except 0
-cpdef size_t length_of(size_t lex_id) except *
-cpdef double prob_of(size_t lex_id) except 0
-cpdef ClusterID cluster_of(size_t lex_id) except 0
-cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
-cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
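
The pxd above defines the public accessor surface; a sketch of how it is meant to be called from Python, again assuming the package imports at this commit:

    from spacy import en
    from spacy import lexeme

    lex_id = en.lookup(u"pineapple")              # LexID: address of a Lexeme struct
    assert lexeme.length_of(lex_id) == 9          # length of the UTF-8 string
    assert chr(lexeme.first_of(lex_id)) == "p"    # first_of returns a C char (an int in Python)
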

View File

@@ -1,32 +1,32 @@
 # cython: profile=True
+# cython: embedsignature=True
 '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
 from __future__ import unicode_literals

-from spacy.string_tools cimport substr

 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
-from libcpp.vector cimport vector

 from spacy.spacy cimport StringHash


-cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
-    if attr == LEX:
-        return lex_of(lex_id)
-    elif attr == NORM:
-        return norm_of(lex_id)
-    elif attr == SHAPE:
-        return shape_of(lex_id)
-    elif attr == LAST3:
-        return last3_of(lex_id)
-    elif attr == LENGTH:
-        return length_of(lex_id)
-    else:
-        raise StandardError
+cpdef int set_flags(LexID lex_id, object active_flags) except *:
+    """Set orthographic bit flags for a Lexeme.
+
+    Args:
+        lex_id (LexemeID): A reference ID for a Lexeme.
+        active_flags: A sequence of bits to set as True.
+    """
+    cdef size_t flag
+    cdef Lexeme* w = <Lexeme*>lex_id
+    for flag in active_flags:
+        w.orth_flags |= 1 << flag
+
+
+cpdef StringHash view_of(LexID lex_id, size_t view) except 0:
+    return (<Lexeme*>lex_id).string_views[view]


 cpdef StringHash lex_of(size_t lex_id) except 0:
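
set_flags treats orth_flags as a plain bit set; a minimal pure-Python model of the update (names are illustrative):

    def set_flags(orth_flags, active_flags):
        # Mirror of the Cython loop: OR in one bit per active flag index.
        for flag in active_flags:
            orth_flags |= 1 << flag
        return orth_flags

    flags = set_flags(0, [0, 3])        # set bits 0 and 3
    assert flags == 0b1001
    assert bool(flags & (1 << 3))       # the check_*_flag accessors test bits this way
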
@@ -37,42 +37,14 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     delimited tokens split off. The other fields refer to properties of the
     string that the lex field stores a hash of, except sic and tail.

-    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
+    >>> from spacy import en
+    >>> [en.unhash(lex_of(lex_id) for lex_id in en.tokenize(u'Hi! world')]
     [u'Hi', u'!', u'world']
     '''
     return (<Lexeme*>lex_id).lex


-cpdef StringHash norm_of(size_t lex_id) except 0:
-    '''Access the `lex' field of the Lexeme pointed to by lex_id.
-
-    The lex field is the hash of the string you would expect to get back from
-    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
-    delimited tokens split off. The other fields refer to properties of the
-    string that the lex field stores a hash of, except sic and tail.
-
-    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
-    [u'Hi', u'!', u'world']
-    '''
-    return (<Lexeme*>lex_id).orth.norm
-
-
-cpdef StringHash shape_of(size_t lex_id) except 0:
-    return (<Lexeme*>lex_id).orth.shape
-
-
-cpdef StringHash last3_of(size_t lex_id) except 0:
-    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
-    the hash of the last three characters of the word:
-
-    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
-    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
-    [u'llo', u'!']
-    '''
-    return (<Lexeme*>lex_id).orth.last3
-
-
-cpdef ClusterID cluster_of(size_t lex_id) except 0:
+cpdef ClusterID cluster_of(LexID lex_id) except 0:
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,
     which should be understood as a binary address:
@@ -88,10 +60,10 @@ cpdef ClusterID cluster_of(size_t lex_id) except 0:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).dist.cluster
+    return (<Lexeme*>lex_id).cluster


-cpdef Py_UNICODE first_of(size_t lex_id):
+cpdef char first_of(size_t lex_id) except 0:
     '''Access the `first' field of the Lexeme pointed to by lex_id, which
     stores the first character of the lex string of the word.

@@ -99,10 +71,10 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).orth.first
+    return (<Lexeme*>lex_id).string[0]


-cpdef size_t length_of(size_t lex_id) except *:
+cpdef size_t length_of(size_t lex_id) except 0:
     '''Access the `length' field of the Lexeme pointed to by lex_id, which stores
     the length of the string hashed by lex_of.'''
     cdef Lexeme* word = <Lexeme*>lex_id
@@ -119,8 +91,10 @@ cpdef double prob_of(size_t lex_id) except 0:
     >>> prob_of(lookup(u'world'))
     -20.10340371976182
     '''
-    return (<Lexeme*>lex_id).dist.prob
+    return (<Lexeme*>lex_id).prob

+DEF OFT_UPPER = 1
+DEF OFT_TITLE = 2

 cpdef bint is_oft_upper(size_t lex_id):
     '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which

@@ -134,7 +108,7 @@ cpdef bint is_oft_upper(size_t lex_id):
     >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
+    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)


 cpdef bint is_oft_title(size_t lex_id):

@@ -149,11 +123,15 @@ cpdef bint is_oft_title(size_t lex_id):
     >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
+    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)


-cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
-    return (<Lexeme*>lex_id).orth.flags & (1 << flag)
+cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
+    return (<Lexeme*>lex_id).orth_flags & (1 << flag)


-cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
-    return (<Lexeme*>lex_id).dist.flags & (1 << flag)
+cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
+    return (<Lexeme*>lex_id).dist_flags & (1 << flag)
+
+
+cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
+    return (<Lexeme*>lex_id).possible_tags & (1 << flag)
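
The cluster_of docstring reads Brown cluster IDs as binary addresses, where a longer shared prefix means the words were merged earlier and are distributionally more similar; a small illustration (the bit strings below are invented for the example):

    def shared_prefix(a, b):
        # Count how many leading bits two cluster addresses share.
        n = 0
        for x, y in zip(a, b):
            if x != y:
                break
            n += 1
        return n

    apples = "111101100111"
    pears = "111101100101"       # long shared prefix: similar distribution
    dapple = "010110000000"      # short shared prefix: unrelated word
    assert shared_prefix(apples, pears) > shared_prefix(apples, dapple)
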

View File

@@ -19,8 +19,6 @@ ctypedef int ClusterID

 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Distribution
-from spacy.lexeme cimport Orthography


 cdef class Language:

@@ -29,7 +27,7 @@ cdef class Language:
     cdef dense_hash_map[StringHash, size_t] vocab
     cdef dict bacov

-    cdef Tokens tokenize(self, unicode text)
+    cpdef Tokens tokenize(self, unicode text)

     cdef Lexeme* lookup(self, unicode string) except NULL
     cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL

@@ -37,7 +35,8 @@ cdef class Language:
     cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
     cdef Lexeme* new_lexeme(self, unicode lex) except NULL

-    cdef unicode unhash(self, StringHash hashed)
+    cpdef unicode unhash(self, StringHash hashed)

-    cpdef list find_substrings(self, unicode word)
+    cpdef list find_substrings(self, unicode chunk)
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode string, Lexeme* word)
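
These declarations outline the caching scheme: vocab maps string hashes to Lexeme* addresses, bacov is the reverse index back to unicode, and chunks memoizes whole tokenization results. A rough pure-Python model of the lookup path (structure and names are illustrative only):

    class VocabSketch:
        def __init__(self):
            self.vocab = {}    # hash(string) -> lexeme record (Lexeme* in the real code)
            self.bacov = {}    # hash(string) -> original unicode string

        def lookup(self, string):
            key = hash(string)
            if key not in self.vocab:
                # new_lexeme: create and memoize on first sight
                self.vocab[key] = {"string": string, "length": len(string)}
                self.bacov[key] = string
            return self.vocab[key]

    v = VocabSketch()
    assert v.lookup(u"world") is v.lookup(u"world")   # memoized: the same record comes back
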

View File

@@ -1,4 +1,13 @@
 # cython: profile=True
+# cython: embedsignature=True
+"""Common classes and utilities across languages.
+
+Provides the main implementation for the spacy tokenizer. Specific languages
+subclass the Language class, over-writing the tokenization rules as necessary.
+Special-case tokenization rules are read from data/<lang>/tokenization .
+"""
 from __future__ import unicode_literals

 from libc.stdlib cimport calloc, free
@@ -6,54 +15,13 @@ from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref

 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport BLANK_WORD
+from spacy.lexeme cimport LexID

 from spacy.string_tools cimport substr

 from . import util
 from os import path

+DIST_FLAGS = {}
 TAGS = {}
-DIST_FLAGS = {}
-
-
-def get_normalized(unicode lex):
-    if lex.isalpha() and lex.islower():
-        return lex
-    else:
-        return get_word_shape(lex)
-
-
-def get_word_shape(unicode lex):
-    cdef size_t length = len(lex)
-    shape = ""
-    last = ""
-    shape_char = ""
-    seq = 0
-    for c in lex:
-        if c.isalpha():
-            if c.isupper():
-                shape_char = "X"
-            else:
-                shape_char = "x"
-        elif c.isdigit():
-            shape_char = "d"
-        else:
-            shape_char = c
-        if shape_char == last:
-            seq += 1
-        else:
-            seq = 0
-        last = shape_char
-        if seq < 3:
-            shape += shape_char
-    assert shape
-    return shape
-
-
-def set_orth_flags(lex):
-    return 0


 cdef class Language:
     def __cinit__(self, name):
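
For reference, the get_word_shape being deleted here maps letters to X/x, digits to d, keeps other characters verbatim, and caps runs of the same shape character at three; a pure-Python equivalent with a couple of worked outputs (behaviour inferred from the removed code):

    def word_shape(lex):
        shape = ""
        last = ""
        seq = 0
        for c in lex:
            if c.isalpha():
                shape_char = "X" if c.isupper() else "x"
            elif c.isdigit():
                shape_char = "d"
            else:
                shape_char = c
            if shape_char == last:
                seq += 1
            else:
                seq = 0
            last = shape_char
            if seq < 3:
                shape += shape_char
        return shape

    assert word_shape(u"Pineapple") == "Xxxx"
    assert word_shape(u"don't") == "xxx'x"
    assert word_shape(u"C3PO") == "XdXX"
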
@@ -64,9 +32,19 @@ cdef class Language:
         self.chunks.set_empty_key(0)
         self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
-        #self.load_dist_info(util.read_dist_info(name))
+        self.load_dist_info(util.read_dist_info(name))

-    cdef Tokens tokenize(self, unicode string):
+    cpdef Tokens tokenize(self, unicode string):
+        """Tokenize.
+
+        Split the string into tokens.
+
+        Args:
+            string (unicode): The string to split.
+
+        Returns:
+            tokens (Tokens): A Tokens object.
+        """
         cdef Lexeme** chunk
         cdef Tokens tokens = Tokens(self)
         cdef size_t length = len(string)
@@ -85,8 +63,7 @@ cdef class Language:
         return tokens

     cdef Lexeme* lookup(self, unicode string) except NULL:
-        if len(string) == 0:
-            return &BLANK_WORD
+        assert len(string) != 0
         cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
         if word == NULL:
             word = self.new_lexeme(string)
@@ -113,56 +90,79 @@ cdef class Language:
         cdef bytes byte_string = string.encode('utf8')
         word.string = <char*>byte_string
         word.length = len(byte_string)
-        word.orth.flags = set_orth_flags(string)
-        cdef unicode norm = get_normalized(string)
-        cdef unicode shape = get_word_shape(string)
-        cdef unicode last3 = string[-3:]
-        word.lex = hash(string)
-        word.orth.norm = hash(norm)
-        word.orth.shape = hash(shape)
-        word.orth.last3 = hash(last3)
-        self.bacov[word.lex] = string
-        self.bacov[word.orth.norm] = norm
-        self.bacov[word.orth.shape] = shape
-        self.bacov[word.orth.last3] = last3
-        self.vocab[hash(string)] = <size_t>word
+        self.set_orth(string, word)
+
+        word.lex = hash(string)
+        self.bacov[word.lex] = string
+        self.vocab[word.lex] = <LexID>word
         return word

-    cdef unicode unhash(self, StringHash hash_value):
+    cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]

-    cpdef list find_substrings(self, unicode word):
+    cpdef list find_substrings(self, unicode chunk):
+        """Find how to split a chunk into substrings.
+
+        This method calls find_split repeatedly. Most languages will want to
+        override find_split, but it may be useful to override this instead.
+
+        Args:
+            chunk (unicode): The string to be split, e.g. u"Mike's!"
+
+        Returns:
+            substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
+        """
         substrings = []
-        while word:
-            split = self.find_split(word)
+        while chunk:
+            split = self.find_split(chunk)
             if split == 0:
-                substrings.append(word)
+                substrings.append(chunk)
                 break
-            substrings.append(word[:split])
-            word = word[split:]
+            substrings.append(chunk[:split])
+            chunk = chunk[split:]
         return substrings

     cdef int find_split(self, unicode word):
         return len(word)

-    def load_tokenization(self, token_rules=None):
+    cdef int set_orth(self, unicode string, Lexeme* word):
+        pass
+
+    def load_tokenization(self, token_rules):
+        '''Load special-case tokenization rules.
+
+        Loads special-case tokenization rules into the Language.chunk cache,
+        read from data/<lang>/tokenization . The special cases are loaded before
+        any language data is tokenized, giving these priority. For instance,
+        the English tokenization rules map "ain't" to ["are", "not"].
+
+        Args:
+            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
+                a string and tokens is a list of strings.
+        '''
         for chunk, tokens in token_rules:
             self.new_chunk(chunk, tokens)

     def load_dist_info(self, dist_info):
+        '''Load distributional information for the known lexemes of the language.
+
+        The distributional information is read from data/<lang>/dist_info.json .
+        It contains information like the (smoothed) unigram log probability of
+        the word, how often the word is found upper-cased, how often the word
+        is found title-cased, etc.
+        '''
         cdef unicode string
         cdef dict word_dist
         cdef Lexeme* w
         for string, word_dist in dist_info.items():
             w = self.lookup(string)
-            w.dist.prob = word_dist.prob
-            w.dist.cluster = word_dist.cluster
+            w.prob = word_dist.prob
+            w.cluster = word_dist.cluster
             for flag in word_dist.flags:
-                w.dist.flags |= DIST_FLAGS[flag]
+                w.dist_flags |= DIST_FLAGS[flag]
             for tag in word_dist.tagdict:
-                w.dist.tagdict |= TAGS[tag]
+                w.possible_tags |= TAGS[tag]


 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
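
A hedged sketch of the rule format load_tokenization expects, based on the docstring above; the on-disk layout of data/<lang>/tokenization is not shown in this diff, and the extra pair below is purely illustrative:

    # (chunk, tokens) pairs: when the raw chunk is seen, emit the listed tokens
    # instead of running the usual find_split logic.
    token_rules = [
        (u"ain't", [u"are", u"not"]),
        (u"can't", [u"can", u"not"]),    # illustrative, not taken from the data files
    ]
    # lang.load_tokenization(token_rules)   # lang: an instance of a Language subclass
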

View File

@@ -4,7 +4,6 @@ from spacy.lexeme cimport Lexeme
 from cython.operator cimport dereference as deref

 from spacy.spacy cimport Language
-from spacy.lexeme cimport StringAttr


 cdef class Tokens:

@@ -15,5 +14,5 @@ cdef class Tokens:
     cpdef int append(self, Lexeme_addr token)
     cpdef int extend(self, Tokens other) except -1

-    cpdef object group_by(self, StringAttr attr)
-    cpdef dict count_by(self, StringAttr attr)
+    cpdef object group_by(self, size_t attr)
+    cpdef dict count_by(self, size_t attr)

View File

@@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc

 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
+#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of

 from spacy.spacy cimport StringHash

@@ -37,7 +37,7 @@ cdef class Tokens:
         for el in other:
             self.append(el)

-    cpdef object group_by(self, StringAttr attr):
+    cpdef object group_by(self, size_t attr):
         '''Group tokens that share the property attr into Tokens instances, and
         return a list of them. Returns a tuple of three lists:

@@ -66,7 +66,8 @@ cdef class Tokens:
         cdef StringHash key
         cdef Lexeme_addr t
         for t in self.vctr[0]:
-            key = attr_of(t, attr)
+            #key = attr_of(t, attr)
+            key = 0
             if key in indices:
                 groups[indices[key]].append(t)
             else:

@@ -77,12 +78,13 @@ cdef class Tokens:
                 groups[-1].append(t)
         return names, hashes, groups

-    cpdef dict count_by(self, StringAttr attr):
+    cpdef dict count_by(self, size_t attr):
         counts = {}
         cdef Lexeme_addr t
         cdef StringHash key
         for t in self.vctr[0]:
-            key = attr_of(t, attr)
+            #key = attr_of(t, attr)
+            key = 0
             if key not in counts:
                 counts[key] = 0
             counts[key] += 1
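
count_by is a plain histogram over a per-token attribute; since attribute extraction is stubbed out in this commit, the extractor below is a placeholder, but the loop is the same:

    def count_by(tokens, attr_of):
        # tokens: iterable of token ids; attr_of: callable returning the keyed attribute
        counts = {}
        for t in tokens:
            key = attr_of(t)
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
        return counts

    assert count_by([1, 2, 2, 3], lambda t: t) == {1: 1, 2: 2, 3: 1}
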