Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 09:14:32 +03:00

Commit 057c21969b (parent 9fd085bf90)

    Refactor for string view features. Working on setting up flags and enums.

@@ -1,8 +1,8 @@
 from libcpp.vector cimport vector

 from spacy.spacy cimport StringHash
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme
+from spacy.spacy cimport Lexeme_addr

 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens

@@ -9,7 +9,6 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr

 from . import util

@@ -2,8 +2,8 @@ from libcpp.vector cimport vector

 from spacy.spacy cimport StringHash
 from spacy.spacy cimport Language
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme
+from spacy.spacy cimport Lexeme_addr
 from spacy.tokens cimport Tokens


@@ -4,11 +4,11 @@ boldly assume no collisions.
 '''
 from __future__ import unicode_literals

+
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr
 from spacy.spacy cimport Language
 from . import util

@@ -4,29 +4,48 @@ from libc.stdint cimport uint64_t
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
 ctypedef size_t Lexeme_addr
+ctypedef char Bits8
+ctypedef uint64_t Bits64


 from spacy.spacy cimport Language


+cdef struct Orthography:
+    StringHash last3
+    StringHash shape
+    StringHash norm
+
+    Py_UNICODE first
+    Bits8 flags
+
+
+cdef struct Distribution:
+    double prob
+    ClusterID cluster
+    Bits64 tagdict
+    Bits8 flags
+
+
 cdef struct Lexeme:
     StringHash sic    # Hash of the original string
     StringHash lex    # Hash of the word, with punctuation and clitics split off
-    StringHash normed # Hash of the normalized version of lex
-    StringHash last3  # Last 3 characters of the token
-    Py_UNICODE first  # First character of the token

-    double prob       # What is the log probability of the lex value?
-    ClusterID cluster # Brown cluster of the token
+    Distribution* dist    # Distribution info, lazy loaded
+    Orthography* orth     # Extra orthographic views

-    bint oft_upper    # Is the lowered version of the lex value often in all caps?
-    bint oft_title    # Is the lowered version of the lex value often title-cased?
     Lexeme* tail      # Lexemes are linked lists, to deal with sub-tokens


-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)

-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length)
+cpdef StringHash lex_of(size_t lex_id) except 0
+cpdef StringHash norm_of(size_t lex_id) except 0
+#cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
+#                         int split, size_t length)


 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item. This allows safe iteration
 # over the Lexeme, via:

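The hunk above is the core of the refactor: the monolithic Lexeme loses its orthographic and distributional fields, which move into two lazily allocated "view" structs reached through pointers. A plain-Python sketch of the resulting shape (dataclasses standing in for the Cython structs; the field names mirror the diff, everything else is illustrative):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Orthography:            # mirrors the new `cdef struct Orthography`
    last3: int                # StringHash of the last three characters
    shape: int                # StringHash of the word shape
    norm: int                 # StringHash of the normalized form
    first: str                # first character (Py_UNICODE in the Cython code)
    flags: int                # Bits8 flag field

@dataclass
class Distribution:           # mirrors the new `cdef struct Distribution`
    prob: float               # log probability of the lex value
    cluster: int              # Brown cluster ID
    tagdict: int              # Bits64 tag-dictionary field
    flags: int                # Bits8 flag field

@dataclass
class Lexeme:                 # the slimmed-down core struct
    sic: int                  # hash of the original string
    lex: int                  # hash with punctuation and clitics split off
    dist: Optional[Distribution] = None   # lazy-loaded distributional view
    orth: Optional[Orthography] = None    # lazy-loaded orthographic view
    tail: Optional["Lexeme"] = None       # linked list of sub-tokens
```
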
@@ -11,49 +11,7 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length):
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-
-    cdef unicode tail_string
-    cdef unicode lex
-    if split != 0 and split < length:
-        lex = substr(string, 0, split, length)
-        tail_string = substr(string, split, length, length)
-    else:
-        lex = string
-        tail_string = ''
-    assert lex
-    #cdef unicode normed = normalize_word_string(lex)
-    cdef unicode normed = '?'
-    cdef unicode last3 = substr(string, length - 3, length, length)
-
-    assert normed
-    assert len(normed)
-
-    word.lex = lang.hash_string(lex, len(lex))
-    word.normed = lang.hash_string(normed, len(normed))
-    word.last3 = lang.hash_string(last3, len(last3))
-
-    lang.bacov[word.lex] = lex
-    lang.bacov[word.normed] = normed
-    lang.bacov[word.last3] = last3
-
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))
-    return word
+from spacy.spacy cimport StringHash


 cpdef StringHash sic_of(size_t lex_id) except 0:

@@ -82,6 +40,20 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).lex


+cpdef StringHash norm_of(size_t lex_id) except 0:
+    '''Access the `norm' field of the Lexeme pointed to by lex_id.
+
+    The lex field is the hash of the string you would expect to get back from
+    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
+    delimited tokens split off. The other fields refer to properties of the
+    string that the lex field stores a hash of, except sic and tail.
+
+    >>> [unhash(lex_of(lex_id)) for lex_id in from_string(u'Hi! world')]
+    [u'Hi', u'!', u'world']
+    '''
+    return (<Lexeme*>lex_id).orth.norm


 cpdef ClusterID cluster_of(size_t lex_id):
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,

@@ -98,7 +70,7 @@ cpdef ClusterID cluster_of(size_t lex_id):
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).cluster
+    return (<Lexeme*>lex_id).dist.cluster


 cpdef Py_UNICODE first_of(size_t lex_id):

@@ -109,7 +81,7 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).first
+    return (<Lexeme*>lex_id).orth.first


 cpdef double prob_of(size_t lex_id):

@@ -133,7 +105,8 @@ cpdef StringHash last3_of(size_t lex_id):
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
-    return (<Lexeme*>lex_id).last3
+    cdef Lexeme* w = <Lexeme*>lex_id
+    return w.orth.last3 if w.orth != NULL else 0


 cpdef bint is_oft_upper(size_t lex_id):

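Because the views are now attached lazily, accessors such as last3_of gain a NULL guard and fall back to 0. That sentinel is safe here: the commit reserves 0 as the dense_hash_map empty key, and the cpdef signatures already use `except 0`. A tiny Python sketch of the pattern (stand-in classes, not the real API):

```python
class Orth:                      # stand-in for the Orthography view
    def __init__(self, last3):
        self.last3 = last3

class Word:                      # stand-in for a Lexeme with an optional orth view
    def __init__(self, orth=None):
        self.orth = orth

def last3_of(word):
    # Return the hash of the last three characters, or 0 when the
    # orthographic view has not been loaded for this lexeme.
    return word.orth.last3 if word.orth is not None else 0

assert last3_of(Word()) == 0
assert last3_of(Word(Orth(1234))) == 1234
```
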
@@ -148,7 +121,12 @@ cpdef bint is_oft_upper(size_t lex_id):
     >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).oft_upper
+    return False
+    #cdef Lexeme* w = <Lexeme*>lex_id
+    #return w.orth.last3 if w.orth != NULL else 0
+
+
+    #return (<Lexeme*>lex_id).oft_upper


 cpdef bint is_oft_title(size_t lex_id):

@@ -163,4 +141,5 @@ cpdef bint is_oft_title(size_t lex_id):
     >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).oft_title
+    return False
+    #return (<Lexeme*>lex_id).oft_title

@@ -7,16 +7,27 @@ from ext.sparsehash cimport dense_hash_map
 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
-ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
-ctypedef int (*Splitter)(unicode word, size_t length)
+ctypedef dense_hash_map[StringHash, size_t] Vocab
+from spacy.lexeme cimport Lexeme

+from spacy.tokens cimport Tokens
+
+# Put these above import to avoid circular import problem
+ctypedef char Bits8
+ctypedef uint64_t Bits64
+ctypedef int ClusterID


 from spacy.lexeme cimport Lexeme
-from spacy.tokens cimport Tokens
+from spacy.lexeme cimport Distribution
+from spacy.lexeme cimport Orthography


 cdef class Language:
     cdef object name
     cdef Vocab* vocab
+    cdef Vocab* distri
+    cdef Vocab* ortho
     cdef dict bacov
     cdef int find_split(self, unicode word, size_t length)

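The Language class now carries three parallel hash tables, all keyed by StringHash and storing bare pointers as size_t, plus bacov (apparently "vocab" spelled backwards), which maps hashes back to the unicode strings they came from. Roughly, in Python terms, with plain dicts standing in for google::dense_hash_map and Python's hash() for MurmurHash:

```python
class Language:
    def __init__(self, name):
        self.name = name
        self.vocab = {}     # StringHash -> Lexeme pointer (stored as size_t in Cython)
        self.distri = {}    # StringHash -> Distribution pointer, lazily filled
        self.ortho = {}     # StringHash -> Orthography pointer, lazily filled
        self.bacov = {}     # StringHash -> unicode, the reverse ("unhash") map

    def hash_string(self, s, length):
        # MurmurHash64A in the real code; never return 0, the reserved empty key.
        return hash(s) or 1
```
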
@@ -26,3 +37,8 @@ cdef class Language:

     cpdef Tokens tokenize(self, unicode text)
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length)
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)


@@ -6,22 +6,43 @@ from libc.stdlib cimport calloc, free
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B

-from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD

-from spacy.string_tools cimport is_whitespace
+from spacy.string_tools cimport substr


 from . import util
 from os import path
 cimport cython

+def get_normalized(unicode lex, size_t length):
+    return lex.lower()
+    #if lex.isdigit():
+    #    return '!YEAR' if length == 4 else '!DIGIT'
+    #else:
+    #    return lex.lower()
+
+
+def get_word_shape(lex, length):
+    return lex
+
+
+def set_orth_flags(lex, length):
+    return 0
+
+
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
         self.vocab = new Vocab()
+        self.ortho = new Vocab()
+        self.distri = new Vocab()
         self.vocab[0].set_empty_key(0)
+        self.distri[0].set_empty_key(0)
+        self.ortho[0].set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))

     def load_tokenization(self, token_rules=None):

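get_normalized currently just lowercases, but its commented-out lines sketch the intended behaviour: collapse digit strings into placeholder norms, so that every four-digit number shares a single '!YEAR' entry. Uncommented and run as plain Python, that sketch reads:

```python
def get_normalized(lex, length):
    # What the commented-out body in the diff would do once enabled.
    if lex.isdigit():
        return '!YEAR' if length == 4 else '!DIGIT'
    return lex.lower()

assert get_normalized(u'1984', 4) == '!YEAR'
assert get_normalized(u'123', 3) == '!DIGIT'
assert get_normalized(u'Hello', 5) == 'hello'
```
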
@@ -80,7 +101,7 @@ cdef class Language:
         return <Lexeme_addr>word_ptr

     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
-        word = init_lexeme(self, string, hashed, split, length)
+        word = self.init_lexeme(string, hashed, split, length)
         self.vocab[0][hashed] = <Lexeme_addr>word
         self.bacov[hashed] = string
         return word

@@ -121,6 +142,55 @@ cdef class Language:
     cdef int find_split(self, unicode word, size_t length):
         return -1

+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length):
+        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+
+        word.sic = hashed
+
+        cdef unicode tail_string
+        cdef unicode lex
+        if split != 0 and split < length:
+            lex = substr(string, 0, split, length)
+            tail_string = substr(string, split, length, length)
+        else:
+            lex = string
+            tail_string = ''
+
+        word.lex = self.hash_string(lex, len(lex))
+        self.bacov[word.lex] = lex
+        word.orth = <Orthography*>self.ortho[0][word.lex]
+        if word.orth == NULL:
+            word.orth = self.init_orth(word.lex, lex)
+        word.dist = <Distribution*>self.distri[0][word.lex]
+
+        # Now recurse, and deal with the tail
+        if tail_string:
+            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
+        return word
+
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
+        cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
+        orth.first = <Py_UNICODE>lex[0]
+
+        cdef int length = len(lex)
+
+        orth.flags = set_orth_flags(lex, length)
+
+        cdef unicode last3 = substr(lex, length - 3, length, length)
+        cdef unicode norm = get_normalized(lex, length)
+        cdef unicode shape = get_word_shape(lex, length)
+
+        orth.last3 = self.hash_string(last3, len(last3))
+        orth.shape = self.hash_string(shape, len(shape))
+        orth.norm = self.hash_string(norm, len(norm))
+
+        self.bacov[orth.last3] = last3
+        self.bacov[orth.shape] = shape
+        self.bacov[orth.norm] = norm
+
+        self.ortho[0][hashed] = <size_t>orth
+        return orth
+
+
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:

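init_lexeme now looks the Orthography view up by the lex hash and only calls init_orth on a miss, so every token that shares a lex string shares one view record, and bacov learns how to unhash each derived string. A plain-Python sketch of that interning flow (the Lang class and the lex_view helper are illustrative, not the real API; Python's hash() stands in for MurmurHash):

```python
class Lang:
    def __init__(self):
        self.ortho = {}          # lex hash -> orth view
        self.bacov = {}          # hash -> string, for unhashing

    def hash_string(self, s, length):
        return hash(s) or 1      # MurmurHash64A in the real code; avoid 0

    def init_orth(self, hashed, lex):
        length = len(lex)
        last3 = lex[max(0, length - 3):]   # substr(lex, length - 3, length, length)
        norm = lex.lower()                 # get_normalized stand-in
        shape = lex                        # get_word_shape is still a stub
        orth = {'first': lex[0], 'flags': 0,
                'last3': self.hash_string(last3, len(last3)),
                'norm': self.hash_string(norm, len(norm)),
                'shape': self.hash_string(shape, len(shape))}
        self.bacov[orth['last3']] = last3
        self.bacov[orth['norm']] = norm
        self.bacov[orth['shape']] = shape
        self.ortho[hashed] = orth          # intern the view under the lex hash
        return orth

    def lex_view(self, lex):
        hashed = self.hash_string(lex, len(lex))
        self.bacov[hashed] = lex
        orth = self.ortho.get(hashed)      # the NULL check in init_lexeme
        if orth is None:
            orth = self.init_orth(hashed, lex)
        return orth

lang = Lang()
assert lang.lex_view(u'Hello') is lang.lex_view(u'Hello')    # built once, shared
assert lang.bacov[lang.lex_view(u'Hello')['norm']] == u'hello'
```
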
@@ -137,7 +207,7 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
 cpdef vector[size_t] expand_chunk(size_t addr) except *:
     cdef vector[size_t] tokens = vector[size_t]()
     word = <Lexeme*>addr
-    while word is not NULL:
+    while word != NULL:
         tokens.push_back(<size_t>word)
         word = word.tail
     return tokens

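expand_chunk is unchanged here apart from the NULL comparison: a chunk is a linked list of Lexemes joined through tail, and expansion is a plain list walk. In Python terms (a stand-in node class replaces the Lexeme* pointers; the "do"/"n't" split is only an illustration of a clitic chain):

```python
class Node:
    def __init__(self, value, tail=None):
        self.value = value
        self.tail = tail           # next sub-token, or None (NULL in Cython)

def expand_chunk(word):
    tokens = []
    while word is not None:        # `while word != NULL` after this commit
        tokens.append(word.value)
        word = word.tail
    return tokens

# A split like "don't" -> "do" + "n't" would be stored as a two-node chain:
assert expand_chunk(Node("do", Node("n't"))) == ["do", "n't"]
```
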
@@ -1,5 +1,6 @@
 # cython: profile=True

+
 cpdef unicode substr(unicode string, int start, int end, size_t length):
     if end >= length:
         end = -1

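Only the top of substr appears in the diff, so its full contract is a guess from the call sites: an end at or past length appears to mean "to the end of the string", and init_orth calls it with a negative start for words shorter than three characters. A hypothetical Python equivalent consistent with those uses:

```python
def substr(string, start, end, length):
    # Guessed semantics: clamp both indices to the string's bounds.
    if end >= length:
        end = length
    if start < 0:
        start = 0
    return string[start:end]

assert substr(u'Hello', 2, 5, 5) == u'llo'    # last3 of a 5-char word
assert substr(u'Hi', -1, 2, 2) == u'Hi'       # short word: start clamped
```
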
@@ -1,6 +1,5 @@
 from libcpp.vector cimport vector
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme_addr

 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language

@@ -2,6 +2,11 @@ from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as inc


+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport norm_of
+from spacy.spacy cimport StringHash
+
+
 cdef class Tokens:
     def __cinit__(self, Language lang):
         self.lang = lang

@@ -38,11 +43,11 @@ cdef class Tokens:
     cpdef dict count_by(self, Field attr):
         counts = {}
         cdef Lexeme_addr t
-        cdef Lexeme* word
+        cdef StringHash key
         for t in self.vctr[0]:
-            word = <Lexeme*>t
-            if word.lex not in counts:
-                counts[word.lex] = 0
-            counts[word.lex] += 1
+            key = norm_of(t)
+            if key not in counts:
+                counts[key] = 0
+            counts[key] += 1
         return counts

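count_by now keys the counts on the hash of the normalized view rather than the raw lex hash, so casing variants such as "The" and "the" land in one bucket, given the lowercasing norm above. The logic, reduced to plain Python with norm_of passed in as a callable:

```python
def count_by_norm(tokens, norm_of):
    counts = {}
    for t in tokens:
        key = norm_of(t)           # hash of the normalized view
        if key not in counts:
            counts[key] = 0
        counts[key] += 1
    return counts

words = [u'The', u'cat', u'the']
norms = count_by_norm(words, lambda w: hash(w.lower()))
assert norms[hash(u'the')] == 2
assert norms[hash(u'cat')] == 1
```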