Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 17:36:30 +03:00

Commit a78ad4152d (parent 5fddb8d165): Broken version being refactored for docs

spacy/en.pxd

@@ -10,6 +10,7 @@ from spacy.tokens cimport Tokens
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1
 
 
 cdef English EN

spacy/en.pyx: 66 changed lines

@@ -1,7 +1,8 @@
 # cython: profile=True
-'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
-so that strings can be retrieved from hashes. Use 64-bit hash values and
-boldly assume no collisions.
+# cython: embedsignature=True
+'''Tokenize English text, allowing some differences from the Penn Treebank
+tokenization, e.g. for email addresses, URLs, etc. Use en_ptb if full PTB
+compatibility is the priority.
 '''
 from __future__ import unicode_literals
 
@@ -9,14 +10,17 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
 from spacy.string_tools cimport substr
 
 from . import util
 
 cimport spacy
 
+from spacy.orthography.latin cimport *
 
 
 cdef class English(spacy.Language):
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
+        pass
+
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
@@ -26,17 +30,17 @@ cdef class English(spacy.Language):
         if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
-        if is_punct(word, 0, length):
+        if check_punct(word, 0, length):
             return 1
         elif length >= 1:
             # Split off all trailing punctuation characters
             i = 0
-            while i < length and not is_punct(word, i, length):
+            while i < length and not check_punct(word, i, length):
                 i += 1
         return i
 
 
-cdef bint is_punct(unicode word, size_t i, size_t length):
+cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
         return i == 0
@@ -55,14 +59,52 @@ EN = English('en')
 
 
 cpdef Tokens tokenize(unicode string):
+    """Tokenize a string.
+
+    Wraps EN.tokenize, where EN is an instance of the class English. The global
+    variable manages the vocabulary, and memoizes tokenization rules.
+
+    Args:
+        string (unicode): The string to be split. Must be unicode, not bytes.
+
+    Returns:
+        tokens (Tokens): A Tokens instance, managing a vector of pointers to
+            Lexeme structs. The Tokens instance supports sequence interfaces,
+            but also offers a range of sequence-level operations, which are computed
+            efficiently in Cython-space.
+    """
     return EN.tokenize(string)
 
 
 cpdef Lexeme_addr lookup(unicode string) except 0:
+    """Retrieve (or create) a Lexeme for a string.
+
+    Returns a Lexeme ID, which can be used via the accessor
+    methods in spacy.lexeme
+
+    Args:
+        string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+    Returns:
+        LexemeID (size_t): An unsigned integer that allows the Lexeme to be retrieved.
+            The LexemeID is really a memory address, making dereferencing it essentially
+            free.
+    """
     return <Lexeme_addr>EN.lookup(string)
 
 
 cpdef unicode unhash(StringHash hash_value):
+    """Retrieve a string from a hash value. Mostly used for testing.
+
+    In general you should avoid computing with strings, as they are slower than
+    the intended ID-based usage. However, strings can be recovered if necessary,
+    although no control is taken for hash collisions.
+
+    Args:
+        hash_value (uint32_t): The hash of a string, returned by Python's hash()
+            function.
+
+    Returns:
+        string (unicode): A unicode string that hashes to the hash_value.
+    """
     return EN.unhash(hash_value)
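
The module-level wrappers above are the Python-facing entry points: tokenize returns a Tokens sequence of lex IDs, lookup returns a LexemeID (an integer that is really a Lexeme* address), and unhash recovers a string from the reverse index. A minimal usage sketch, assuming the Cython extensions are compiled and importable as spacy.en and spacy.lexeme as in this commit:

    from __future__ import unicode_literals

    from spacy import en
    from spacy import lexeme

    lex_id = en.lookup(u'Hello')              # LexemeID: an integer holding a Lexeme* address
    print(en.unhash(lexeme.lex_of(lex_id)))   # u'Hello', recovered from the reverse index

    # Following the doctest in spacy/lexeme.pyx: map each token back to its string.
    tokens = en.tokenize(u'Hi! world')
    print([en.unhash(lexeme.lex_of(t)) for t in tokens])   # expected: [u'Hi', u'!', u'world']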

spacy/lexeme.pxd

@@ -1,83 +1,34 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 
-# Put these above import to avoid circular import problem
-
-ctypedef int ClusterID
-ctypedef uint32_t StringHash
-ctypedef size_t Lexeme_addr
-ctypedef char Bits8
-ctypedef uint64_t Bits64
-
-
-cdef enum OrthFlag:
-    IS_ALPHA
-    IS_DIGIT
-    IS_PUNCT
-    IS_WHITE
-    IS_LOWER
-    IS_UPPER
-    IS_TITLE
-    IS_ASCII
-
-
-cdef enum DistFlag:
-    OFT_UPPER
-    OFT_TITLE
-    DIST_FLAG3
-    DIST_FLAG4
-    DIST_FLAG5
-    DIST_FLAG6
-    DIST_FLAG7
-    DIST_FLAG8
-
-
-cdef struct Orthography:
-    StringHash shape
-    StringHash norm
-    StringHash last3
-    Bits8 flags
-
-
-cdef struct Distribution:
-    double prob
-    ClusterID cluster
-    Bits64 tagdict
-    Bits8 flags
+ctypedef size_t LexID
+ctypedef char OrthFlags
+ctypedef char DistFlags
+ctypedef uint64_t TagFlags
 
 
 cdef struct Lexeme:
-    StringHash lex
     char* string
     size_t length
+    StringHash lex
-    Orthography orth      # Extra orthographic views
-    Distribution dist     # Distribution info
+    double prob
+    ClusterID cluster
+    TagFlags possible_tags
+    DistFlags dist_flags
+    OrthFlags orth_flags
     StringHash* string_views
 
 
-cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,
-                                Orthography(0, 0, 0, 0),
-                                Distribution(0.0, 0, 0, 0)
-                               )
+cpdef char first_of(LexID lex_id) except 0
+cpdef size_t length_of(LexID lex_id) except 0
+cpdef double prob_of(LexID lex_id) except 0
+cpdef ClusterID cluster_of(LexID lex_id) except 0
 
+cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
+cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
+cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
 
-cdef enum StringAttr:
-    LEX
-    NORM
-    SHAPE
-    LAST3
-    LENGTH
-
-
-cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
-
-cpdef StringHash lex_of(size_t lex_id) except 0
-cpdef StringHash norm_of(size_t lex_id) except 0
-cpdef StringHash shape_of(size_t lex_id) except 0
-cpdef StringHash last3_of(size_t lex_id) except 0
-
-cpdef size_t length_of(size_t lex_id) except *
-
-cpdef double prob_of(size_t lex_id) except 0
-cpdef ClusterID cluster_of(size_t lex_id) except 0
-
-cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
-cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
+cpdef StringHash view_of(LexID lex_id, size_t view) except 0
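
The new Lexeme layout above replaces the nested Orthography and Distribution structs with flat fields plus packed bit-sets: orth_flags and dist_flags are single chars, possible_tags is a 64-bit mask. A flag is set with bits |= 1 << flag and tested with bits & (1 << flag), as the accessors in spacy/lexeme.pyx below do. A small pure-Python sketch of that arithmetic, using the DEF constants that lexeme.pyx introduces (OFT_UPPER = 1, OFT_TITLE = 2):

    # Illustrative sketch of the flag packing behind dist_flags / orth_flags.
    OFT_UPPER = 1
    OFT_TITLE = 2

    def set_flag(bits, flag):
        # what set_flags does for orth_flags
        return bits | (1 << flag)

    def check_flag(bits, flag):
        # what check_dist_flag / check_orth_flag do
        return bool(bits & (1 << flag))

    dist_flags = 0
    dist_flags = set_flag(dist_flags, OFT_TITLE)
    assert check_flag(dist_flags, OFT_TITLE)
    assert not check_flag(dist_flags, OFT_UPPER)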

spacy/lexeme.pyx

@@ -1,32 +1,32 @@
 # cython: profile=True
+# cython: embedsignature=True
 '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
 from __future__ import unicode_literals
 
 from spacy.string_tools cimport substr
 
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
 from spacy.spacy cimport StringHash
 
 
-cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
-    if attr == LEX:
-        return lex_of(lex_id)
-    elif attr == NORM:
-        return norm_of(lex_id)
-    elif attr == SHAPE:
-        return shape_of(lex_id)
-    elif attr == LAST3:
-        return last3_of(lex_id)
-    elif attr == LENGTH:
-        return length_of(lex_id)
-    else:
-        raise StandardError
+cpdef int set_flags(LexID lex_id, object active_flags) except *:
+    """Set orthographic bit flags for a Lexeme.
+
+    Args:
+        lex_id (LexemeID): A reference ID for a Lexeme.
+        active_flags: A sequence of bits to set as True.
+    """
+    cdef size_t flag
+    cdef Lexeme* w = <Lexeme*>lex_id
+    for flag in active_flags:
+        w.orth_flags |= 1 << flag
+
+
+cpdef StringHash view_of(LexID lex_id, size_t view) except 0:
+    return (<Lexeme*>lex_id).string_views[view]
 
 
 cpdef StringHash lex_of(size_t lex_id) except 0:
@@ -37,42 +37,14 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     delimited tokens split off. The other fields refer to properties of the
     string that the lex field stores a hash of, except sic and tail.
 
-    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
+    >>> from spacy import en
+    >>> [en.unhash(lex_of(lex_id) for lex_id in en.tokenize(u'Hi! world')]
     [u'Hi', u'!', u'world']
     '''
     return (<Lexeme*>lex_id).lex
 
 
-cpdef StringHash norm_of(size_t lex_id) except 0:
-    '''Access the `lex' field of the Lexeme pointed to by lex_id.
-
-    The lex field is the hash of the string you would expect to get back from
-    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
-    delimited tokens split off. The other fields refer to properties of the
-    string that the lex field stores a hash of, except sic and tail.
-
-    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
-    [u'Hi', u'!', u'world']
-    '''
-    return (<Lexeme*>lex_id).orth.norm
-
-
-cpdef StringHash shape_of(size_t lex_id) except 0:
-    return (<Lexeme*>lex_id).orth.shape
-
-
-cpdef StringHash last3_of(size_t lex_id) except 0:
-    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
-    the hash of the last three characters of the word:
-
-    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
-    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
-    [u'llo', u'!']
-    '''
-    return (<Lexeme*>lex_id).orth.last3
-
-
-cpdef ClusterID cluster_of(size_t lex_id) except 0:
+cpdef ClusterID cluster_of(LexID lex_id) except 0:
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,
     which should be understood as a binary address:
@@ -88,10 +60,10 @@ cpdef ClusterID cluster_of(size_t lex_id) except 0:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).dist.cluster
+    return (<Lexeme*>lex_id).cluster
 
 
-cpdef Py_UNICODE first_of(size_t lex_id):
+cpdef char first_of(size_t lex_id) except 0:
     '''Access the `first' field of the Lexeme pointed to by lex_id, which
     stores the first character of the lex string of the word.
 
@@ -99,10 +71,10 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).orth.first
+    return (<Lexeme*>lex_id).string[0]
 
 
-cpdef size_t length_of(size_t lex_id) except *:
+cpdef size_t length_of(size_t lex_id) except 0:
     '''Access the `length' field of the Lexeme pointed to by lex_id, which stores
     the length of the string hashed by lex_of.'''
     cdef Lexeme* word = <Lexeme*>lex_id
@@ -119,8 +91,10 @@ cpdef double prob_of(size_t lex_id) except 0:
     >>> prob_of(lookup(u'world'))
     -20.10340371976182
     '''
-    return (<Lexeme*>lex_id).dist.prob
+    return (<Lexeme*>lex_id).prob
 
+DEF OFT_UPPER = 1
+DEF OFT_TITLE = 2
 
 cpdef bint is_oft_upper(size_t lex_id):
     '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
@@ -134,7 +108,7 @@ cpdef bint is_oft_upper(size_t lex_id):
     >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
+    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
 
 
 cpdef bint is_oft_title(size_t lex_id):
@@ -149,11 +123,15 @@ cpdef bint is_oft_title(size_t lex_id):
     >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
+    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
 
-cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
-    return (<Lexeme*>lex_id).orth.flags & (1 << flag)
+cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
+    return (<Lexeme*>lex_id).orth_flags & (1 << flag)
 
 
-cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
-    return (<Lexeme*>lex_id).dist.flags & (1 << flag)
+cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
+    return (<Lexeme*>lex_id).dist_flags & (1 << flag)
+
+
+cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
+    return (<Lexeme*>lex_id).possible_tags & (1 << flag)
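
The accessors above all follow the pattern described in the module docstring: a lex_id is just an integer holding a Lexeme* address, and each cpdef function casts it back to read one field. A usage sketch from Python, assuming the compiled modules are importable; outputs follow the doctests above and are otherwise illustrative:

    from __future__ import unicode_literals

    from spacy import en
    from spacy.lexeme import lex_of, length_of, prob_of, cluster_of

    lex_id = en.lookup(u'Hello')          # get-or-create the Lexeme, return its address
    print(length_of(lex_id))              # 5: length of the underlying string
    print(en.unhash(lex_of(lex_id)))      # u'Hello', via the reverse index
    print(prob_of(lex_id))                # smoothed unigram log probability
    print(cluster_of(lex_id))             # cluster ID, read as a binary address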

spacy/spacy.pxd

@@ -19,8 +19,6 @@ ctypedef int ClusterID
 
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Distribution
-from spacy.lexeme cimport Orthography
 
 
 cdef class Language:
@@ -29,7 +27,7 @@ cdef class Language:
     cdef dense_hash_map[StringHash, size_t] vocab
     cdef dict bacov
 
-    cdef Tokens tokenize(self, unicode text)
+    cpdef Tokens tokenize(self, unicode text)
 
     cdef Lexeme* lookup(self, unicode string) except NULL
     cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL
@@ -37,7 +35,8 @@ cdef class Language:
     cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
     cdef Lexeme* new_lexeme(self, unicode lex) except NULL
 
-    cdef unicode unhash(self, StringHash hashed)
+    cpdef unicode unhash(self, StringHash hashed)
 
-    cpdef list find_substrings(self, unicode word)
+    cpdef list find_substrings(self, unicode chunk)
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode string, Lexeme* word)

spacy/spacy.pyx: 144 changed lines

@@ -1,4 +1,13 @@
 # cython: profile=True
+# cython: embedsignature=True
+"""Common classes and utilities across languages.
+
+Provides the main implementation for the spacy tokenizer. Specific languages
+subclass the Language class, over-writing the tokenization rules as necessary.
+Special-case tokenization rules are read from data/<lang>/tokenization .
+"""
+
+
 from __future__ import unicode_literals
 
 from libc.stdlib cimport calloc, free
@@ -6,54 +15,13 @@ from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport BLANK_WORD
 
 from spacy.string_tools cimport substr
+from spacy.lexeme cimport LexID
 
 from . import util
 from os import path
 
+DIST_FLAGS = {}
+TAGS = {}
 
-
-def get_normalized(unicode lex):
-    if lex.isalpha() and lex.islower():
-        return lex
-    else:
-        return get_word_shape(lex)
-
-
-def get_word_shape(unicode lex):
-    cdef size_t length = len(lex)
-    shape = ""
-    last = ""
-    shape_char = ""
-    seq = 0
-    for c in lex:
-        if c.isalpha():
-            if c.isupper():
-                shape_char = "X"
-            else:
-                shape_char = "x"
-        elif c.isdigit():
-            shape_char = "d"
-        else:
-            shape_char = c
-        if shape_char == last:
-            seq += 1
-        else:
-            seq = 0
-        last = shape_char
-        if seq < 3:
-            shape += shape_char
-    assert shape
-    return shape
-
-
-def set_orth_flags(lex):
-    return 0
-
-DIST_FLAGS = {}
 
 cdef class Language:
     def __cinit__(self, name):
@@ -64,9 +32,19 @@ cdef class Language:
         self.chunks.set_empty_key(0)
         self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
-        #self.load_dist_info(util.read_dist_info(name))
+        self.load_dist_info(util.read_dist_info(name))
 
-    cdef Tokens tokenize(self, unicode string):
+    cpdef Tokens tokenize(self, unicode string):
+        """Tokenize.
+
+        Split the string into tokens.
+
+        Args:
+            string (unicode): The string to split.
+
+        Returns:
+            tokens (Tokens): A Tokens object.
+        """
         cdef Lexeme** chunk
         cdef Tokens tokens = Tokens(self)
         cdef size_t length = len(string)
@@ -85,8 +63,7 @@ cdef class Language:
         return tokens
 
     cdef Lexeme* lookup(self, unicode string) except NULL:
-        if len(string) == 0:
-            return &BLANK_WORD
+        assert len(string) != 0
         cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
         if word == NULL:
             word = self.new_lexeme(string)
@@ -113,56 +90,79 @@ cdef class Language:
         cdef bytes byte_string = string.encode('utf8')
         word.string = <char*>byte_string
         word.length = len(byte_string)
-        word.orth.flags = set_orth_flags(string)
-        cdef unicode norm = get_normalized(string)
-        cdef unicode shape = get_word_shape(string)
-        cdef unicode last3 = string[-3:]
-        word.lex = hash(string)
-        word.orth.norm = hash(norm)
-        word.orth.shape = hash(shape)
-        word.orth.last3 = hash(last3)
-        self.bacov[word.lex] = string
-        self.bacov[word.orth.norm] = norm
-        self.bacov[word.orth.shape] = shape
-        self.bacov[word.orth.last3] = last3
+        self.set_orth(string, word)
 
-        self.vocab[hash(string)] = <size_t>word
+        word.lex = hash(string)
+        self.bacov[word.lex] = string
+        self.vocab[word.lex] = <LexID>word
         return word
 
-    cdef unicode unhash(self, StringHash hash_value):
+    cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
 
-    cpdef list find_substrings(self, unicode word):
+    cpdef list find_substrings(self, unicode chunk):
+        """Find how to split a chunk into substrings.
+
+        This method calls find_split repeatedly. Most languages will want to
+        override find_split, but it may be useful to override this instead.
+
+        Args:
+            chunk (unicode): The string to be split, e.g. u"Mike's!"
+
+        Returns:
+            substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
+        """
         substrings = []
-        while word:
-            split = self.find_split(word)
+        while chunk:
+            split = self.find_split(chunk)
             if split == 0:
-                substrings.append(word)
+                substrings.append(chunk)
                 break
-            substrings.append(word[:split])
-            word = word[split:]
+            substrings.append(chunk[:split])
+            chunk = chunk[split:]
         return substrings
 
     cdef int find_split(self, unicode word):
         return len(word)
 
-    def load_tokenization(self, token_rules=None):
+    cdef int set_orth(self, unicode string, Lexeme* word):
+        pass
+
+    def load_tokenization(self, token_rules):
+        '''Load special-case tokenization rules.
+
+        Loads special-case tokenization rules into the Language.chunk cache,
+        read from data/<lang>/tokenization . The special cases are loaded before
+        any language data is tokenized, giving these priority. For instance,
+        the English tokenization rules map "ain't" to ["are", "not"].
+
+        Args:
+            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
+                a string and tokens is a list of strings.
+        '''
        for chunk, tokens in token_rules:
             self.new_chunk(chunk, tokens)
 
     def load_dist_info(self, dist_info):
+        '''Load distributional information for the known lexemes of the language.
+
+        The distributional information is read from data/<lang>/dist_info.json .
+        It contains information like the (smoothed) unigram log probability of
+        the word, how often the word is found upper-cased, how often the word
+        is found title-cased, etc.
+        '''
         cdef unicode string
         cdef dict word_dist
         cdef Lexeme* w
         for string, word_dist in dist_info.items():
             w = self.lookup(string)
-            w.dist.prob = word_dist.prob
-            w.dist.cluster = word_dist.cluster
+            w.prob = word_dist.prob
+            w.cluster = word_dist.cluster
             for flag in word_dist.flags:
-                w.dist.flags |= DIST_FLAGS[flag]
+                w.dist_flags |= DIST_FLAGS[flag]
             for tag in word_dist.tagdict:
-                w.dist.tagdict |= TAGS[tag]
+                w.possible_tags |= TAGS[tag]
 
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
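
The find_substrings loop above fixes the contract between Language and its subclasses: find_split returns how many characters to break off the front of the remaining chunk, and 0 means keep the rest as a single piece. A pure-Python sketch of that loop with a deliberately simple, hypothetical find_split (not the English rules from spacy/en.pyx):

    # Mirror of Language.find_substrings with a toy find_split that only
    # peels a trailing '!' off the chunk; the real English rules also handle
    # leading punctuation and the "'s" suffix.
    def find_split(chunk):
        if chunk.endswith(u'!') and len(chunk) > 1:
            return len(chunk) - 1   # break just before the final '!'
        return 0                    # 0 means: keep the remaining chunk whole

    def find_substrings(chunk):
        substrings = []
        while chunk:
            split = find_split(chunk)
            if split == 0:
                substrings.append(chunk)
                break
            substrings.append(chunk[:split])
            chunk = chunk[split:]
        return substrings

    print(find_substrings(u'Hello!'))   # ['Hello', '!']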

spacy/tokens.pxd

@@ -4,7 +4,6 @@ from spacy.lexeme cimport Lexeme
 
 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language
-from spacy.lexeme cimport StringAttr
 
 
 cdef class Tokens:
@@ -15,5 +14,5 @@ cdef class Tokens:
     cpdef int append(self, Lexeme_addr token)
     cpdef int extend(self, Tokens other) except -1
 
-    cpdef object group_by(self, StringAttr attr)
-    cpdef dict count_by(self, StringAttr attr)
+    cpdef object group_by(self, size_t attr)
+    cpdef dict count_by(self, size_t attr)

spacy/tokens.pyx

@@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
 
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
+#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
 from spacy.spacy cimport StringHash
 
 
@@ -37,7 +37,7 @@ cdef class Tokens:
         for el in other:
             self.append(el)
 
-    cpdef object group_by(self, StringAttr attr):
+    cpdef object group_by(self, size_t attr):
         '''Group tokens that share the property attr into Tokens instances, and
         return a list of them. Returns a tuple of three lists:
 
@@ -66,7 +66,8 @@ cdef class Tokens:
         cdef StringHash key
         cdef Lexeme_addr t
         for t in self.vctr[0]:
-            key = attr_of(t, attr)
+            #key = attr_of(t, attr)
+            key = 0
             if key in indices:
                 groups[indices[key]].append(t)
             else:
@@ -77,12 +78,13 @@ cdef class Tokens:
                 groups[-1].append(t)
         return names, hashes, groups
 
-    cpdef dict count_by(self, StringAttr attr):
+    cpdef dict count_by(self, size_t attr):
         counts = {}
         cdef Lexeme_addr t
         cdef StringHash key
         for t in self.vctr[0]:
-            key = attr_of(t, attr)
+            #key = attr_of(t, attr)
+            key = 0
             if key not in counts:
                 counts[key] = 0
             counts[key] += 1
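
group_by and count_by aggregate tokens by a hashed string attribute: group_by returns three parallel lists (names, hashes, groups) and count_by maps each StringHash key to a token count. In this intermediate commit the attr_of call is commented out and the key is hard-coded to 0, so every token lands in one bucket. A hedged usage sketch, assuming the extension modules still compile in this state:

    from __future__ import unicode_literals

    from spacy import en

    tokens = en.tokenize(u'the cat sat on the mat')

    names, hashes, groups = tokens.group_by(0)   # the attr code is ignored while stubbed
    counts = tokens.count_by(0)
    print(counts)                                # e.g. {0: 6} with the key = 0 stub in place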