mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
* Broken version being refactored for docs
This commit is contained in:
parent
5fddb8d165
commit
a78ad4152d
|
@ -10,6 +10,7 @@ from spacy.tokens cimport Tokens
|
||||||
|
|
||||||
cdef class English(spacy.Language):
|
cdef class English(spacy.Language):
|
||||||
cdef int find_split(self, unicode word)
|
cdef int find_split(self, unicode word)
|
||||||
|
cdef int set_orth(self, unicode word, Lexeme* lex) except -1
|
||||||
|
|
||||||
cdef English EN
|
cdef English EN
|
||||||
|
|
||||||
|
|
66
spacy/en.pyx
66
spacy/en.pyx
|
@ -1,7 +1,8 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
|
# cython: embedsignature=True
|
||||||
so that strings can be retrieved from hashes. Use 64-bit hash values and
|
'''Tokenize English text, allowing some differences from the Penn Treebank
|
||||||
boldly assume no collisions.
|
tokenization, e.g. for email addresses, URLs, etc. Use en_ptb if full PTB
|
||||||
|
compatibility is the priority.
|
||||||
'''
|
'''
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
@ -9,14 +10,17 @@ from libc.stdlib cimport malloc, calloc, free
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
from spacy.string_tools cimport substr
|
|
||||||
|
|
||||||
from . import util
|
|
||||||
|
|
||||||
cimport spacy
|
cimport spacy
|
||||||
|
|
||||||
|
|
||||||
|
from spacy.orthography.latin cimport *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef class English(spacy.Language):
|
cdef class English(spacy.Language):
|
||||||
|
cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
|
||||||
|
pass
|
||||||
|
|
||||||
cdef int find_split(self, unicode word):
|
cdef int find_split(self, unicode word):
|
||||||
cdef size_t length = len(word)
|
cdef size_t length = len(word)
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
|
@ -26,17 +30,17 @@ cdef class English(spacy.Language):
|
||||||
if word.endswith("'s") and length >= 3:
|
if word.endswith("'s") and length >= 3:
|
||||||
return length - 2
|
return length - 2
|
||||||
# Leading punctuation
|
# Leading punctuation
|
||||||
if is_punct(word, 0, length):
|
if check_punct(word, 0, length):
|
||||||
return 1
|
return 1
|
||||||
elif length >= 1:
|
elif length >= 1:
|
||||||
# Split off all trailing punctuation characters
|
# Split off all trailing punctuation characters
|
||||||
i = 0
|
i = 0
|
||||||
while i < length and not is_punct(word, i, length):
|
while i < length and not check_punct(word, i, length):
|
||||||
i += 1
|
i += 1
|
||||||
return i
|
return i
|
||||||
|
|
||||||
|
|
||||||
cdef bint is_punct(unicode word, size_t i, size_t length):
|
cdef bint check_punct(unicode word, size_t i, size_t length):
|
||||||
# Don't count apostrophes as punct if the next char is a letter
|
# Don't count apostrophes as punct if the next char is a letter
|
||||||
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
|
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
|
||||||
return i == 0
|
return i == 0
|
||||||
|
@ -55,14 +59,52 @@ EN = English('en')
|
||||||
|
|
||||||
|
|
||||||
cpdef Tokens tokenize(unicode string):
|
cpdef Tokens tokenize(unicode string):
|
||||||
|
"""Tokenize a string.
|
||||||
|
|
||||||
|
Wraps EN.tokenize, where EN is an instance of the class English. The global
|
||||||
|
variable manages the vocabulary, and memoizes tokenization rules.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
string (unicode): The string to be split. Must be unicode, not bytes.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tokens (Tokens): A Tokens instance, managing a vector of pointers to
|
||||||
|
Lexeme structs. The Tokens instance supports sequence interfaces,
|
||||||
|
but also offers a range of sequence-level operations, which are computed
|
||||||
|
efficiently in Cython-space.
|
||||||
|
"""
|
||||||
return EN.tokenize(string)
|
return EN.tokenize(string)
|
||||||
|
|
||||||
|
|
||||||
cpdef Lexeme_addr lookup(unicode string) except 0:
|
cpdef Lexeme_addr lookup(unicode string) except 0:
|
||||||
|
"""Retrieve (or create) a Lexeme for a string.
|
||||||
|
|
||||||
|
Returns a Lexeme ID, which can be used via the accessor
|
||||||
|
methods in spacy.lexeme
|
||||||
|
|
||||||
|
Args:
|
||||||
|
string (unicode): The string to be looked up. Must be unicode, not bytes.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
LexemeID (size_t): An unsigned integer that allows the Lexeme to be retrieved.
|
||||||
|
The LexemeID is really a memory address, making dereferencing it essentially
|
||||||
|
free.
|
||||||
|
"""
|
||||||
return <Lexeme_addr>EN.lookup(string)
|
return <Lexeme_addr>EN.lookup(string)
|
||||||
|
|
||||||
|
|
||||||
cpdef unicode unhash(StringHash hash_value):
|
cpdef unicode unhash(StringHash hash_value):
|
||||||
|
"""Retrieve a string from a hash value. Mostly used for testing.
|
||||||
|
|
||||||
|
In general you should avoid computing with strings, as they are slower than
|
||||||
|
the intended ID-based usage. However, strings can be recovered if necessary,
|
||||||
|
although hash collisions are neither detected nor handled.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
hash_value (uint32_t): The hash of a string, returned by Python's hash()
|
||||||
|
function.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
string (unicode): A unicode string that hashes to the hash_value.
|
||||||
|
"""
|
||||||
return EN.unhash(hash_value)
|
return EN.unhash(hash_value)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,83 +1,34 @@
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
# Put these above import to avoid circular import problem
|
|
||||||
ctypedef int ClusterID
|
ctypedef int ClusterID
|
||||||
ctypedef uint32_t StringHash
|
ctypedef uint32_t StringHash
|
||||||
ctypedef size_t Lexeme_addr
|
ctypedef size_t LexID
|
||||||
ctypedef char Bits8
|
ctypedef char OrthFlags
|
||||||
ctypedef uint64_t Bits64
|
ctypedef char DistFlags
|
||||||
|
ctypedef uint64_t TagFlags
|
||||||
|
|
||||||
cdef enum OrthFlag:
|
|
||||||
IS_ALPHA
|
|
||||||
IS_DIGIT
|
|
||||||
IS_PUNCT
|
|
||||||
IS_WHITE
|
|
||||||
IS_LOWER
|
|
||||||
IS_UPPER
|
|
||||||
IS_TITLE
|
|
||||||
IS_ASCII
|
|
||||||
|
|
||||||
|
|
||||||
cdef enum DistFlag:
|
|
||||||
OFT_UPPER
|
|
||||||
OFT_TITLE
|
|
||||||
DIST_FLAG3
|
|
||||||
DIST_FLAG4
|
|
||||||
DIST_FLAG5
|
|
||||||
DIST_FLAG6
|
|
||||||
DIST_FLAG7
|
|
||||||
DIST_FLAG8
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct Orthography:
|
|
||||||
StringHash shape
|
|
||||||
StringHash norm
|
|
||||||
StringHash last3
|
|
||||||
Bits8 flags
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct Distribution:
|
|
||||||
double prob
|
|
||||||
ClusterID cluster
|
|
||||||
Bits64 tagdict
|
|
||||||
Bits8 flags
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct Lexeme:
|
cdef struct Lexeme:
|
||||||
|
StringHash lex
|
||||||
char* string
|
char* string
|
||||||
size_t length
|
size_t length
|
||||||
StringHash lex
|
double prob
|
||||||
Orthography orth # Extra orthographic views
|
ClusterID cluster
|
||||||
Distribution dist # Distribution info
|
TagFlags possible_tags
|
||||||
|
DistFlags dist_flags
|
||||||
|
OrthFlags orth_flags
|
||||||
|
StringHash* string_views
|
||||||
|
|
||||||
|
|
||||||
cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,
|
cpdef char first_of(LexID lex_id) except 0
|
||||||
Orthography(0, 0, 0, 0),
|
cpdef size_t length_of(LexID lex_id) except 0
|
||||||
Distribution(0.0, 0, 0, 0)
|
cpdef double prob_of(LexID lex_id) except 0
|
||||||
)
|
cpdef ClusterID cluster_of(LexID lex_id) except 0
|
||||||
|
|
||||||
|
cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
|
||||||
|
cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
|
||||||
|
cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
|
||||||
|
|
||||||
cdef enum StringAttr:
|
cpdef StringHash view_of(LexID lex_id, size_t view) except 0
|
||||||
LEX
|
|
||||||
NORM
|
|
||||||
SHAPE
|
|
||||||
LAST3
|
|
||||||
LENGTH
|
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
|
|
||||||
|
|
||||||
cpdef StringHash lex_of(size_t lex_id) except 0
|
|
||||||
cpdef StringHash norm_of(size_t lex_id) except 0
|
|
||||||
cpdef StringHash shape_of(size_t lex_id) except 0
|
|
||||||
cpdef StringHash last3_of(size_t lex_id) except 0
|
|
||||||
|
|
||||||
cpdef size_t length_of(size_t lex_id) except *
|
|
||||||
|
|
||||||
cpdef double prob_of(size_t lex_id) except 0
|
|
||||||
cpdef ClusterID cluster_of(size_t lex_id) except 0
|
|
||||||
|
|
||||||
cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
|
|
||||||
cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
|
|
||||||
|
|
|
@ -1,32 +1,32 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
|
# cython: embedsignature=True
|
||||||
'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
|
'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
|
||||||
Mostly useful from Python-space. From Cython-space, you can just cast to
|
Mostly useful from Python-space. From Cython-space, you can just cast to
|
||||||
Lexeme* yourself.
|
Lexeme* yourself.
|
||||||
'''
|
'''
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy.string_tools cimport substr
|
|
||||||
|
|
||||||
from libc.stdlib cimport malloc, calloc, free
|
from libc.stdlib cimport malloc, calloc, free
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
from libcpp.vector cimport vector
|
|
||||||
|
|
||||||
from spacy.spacy cimport StringHash
|
from spacy.spacy cimport StringHash
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
|
cpdef int set_flags(LexID lex_id, object active_flags) except *:
|
||||||
if attr == LEX:
|
"""Set orthographic bit flags for a Lexeme.
|
||||||
return lex_of(lex_id)
|
|
||||||
elif attr == NORM:
|
Args:
|
||||||
return norm_of(lex_id)
|
lex_id (LexemeID): A reference ID for a Lexeme.
|
||||||
elif attr == SHAPE:
|
active_flags: A sequence of bit positions; the bit at each position is set in the Lexeme's orth_flags.
|
||||||
return shape_of(lex_id)
|
"""
|
||||||
elif attr == LAST3:
|
cdef size_t flag
|
||||||
return last3_of(lex_id)
|
cdef Lexeme* w = <Lexeme*>lex_id
|
||||||
elif attr == LENGTH:
|
for flag in active_flags:
|
||||||
return length_of(lex_id)
|
w.orth_flags |= 1 << flag
|
||||||
else:
|
|
||||||
raise StandardError
|
|
||||||
|
cpdef StringHash view_of(LexID lex_id, size_t view) except 0:
|
||||||
|
return (<Lexeme*>lex_id).string_views[view]
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash lex_of(size_t lex_id) except 0:
|
cpdef StringHash lex_of(size_t lex_id) except 0:
|
||||||
|
@ -37,42 +37,14 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
|
||||||
delimited tokens split off. The other fields refer to properties of the
|
delimited tokens split off. The other fields refer to properties of the
|
||||||
string that the lex field stores a hash of, except sic and tail.
|
string that the lex field stores a hash of, except sic and tail.
|
||||||
|
|
||||||
>>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
|
>>> from spacy import en
|
||||||
|
>>> [en.unhash(lex_of(lex_id) for lex_id in en.tokenize(u'Hi! world')]
|
||||||
[u'Hi', u'!', u'world']
|
[u'Hi', u'!', u'world']
|
||||||
'''
|
'''
|
||||||
return (<Lexeme*>lex_id).lex
|
return (<Lexeme*>lex_id).lex
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash norm_of(size_t lex_id) except 0:
|
cpdef ClusterID cluster_of(LexID lex_id) except 0:
|
||||||
'''Access the `lex' field of the Lexeme pointed to by lex_id.
|
|
||||||
|
|
||||||
The lex field is the hash of the string you would expect to get back from
|
|
||||||
a standard tokenizer, i.e. the word with punctuation and other non-whitespace
|
|
||||||
delimited tokens split off. The other fields refer to properties of the
|
|
||||||
string that the lex field stores a hash of, except sic and tail.
|
|
||||||
|
|
||||||
>>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
|
|
||||||
[u'Hi', u'!', u'world']
|
|
||||||
'''
|
|
||||||
return (<Lexeme*>lex_id).orth.norm
|
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash shape_of(size_t lex_id) except 0:
|
|
||||||
return (<Lexeme*>lex_id).orth.shape
|
|
||||||
|
|
||||||
|
|
||||||
cpdef StringHash last3_of(size_t lex_id) except 0:
|
|
||||||
'''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
|
|
||||||
the hash of the last three characters of the word:
|
|
||||||
|
|
||||||
>>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
|
|
||||||
>>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
|
|
||||||
[u'llo', u'!']
|
|
||||||
'''
|
|
||||||
return (<Lexeme*>lex_id).orth.last3
|
|
||||||
|
|
||||||
|
|
||||||
cpdef ClusterID cluster_of(size_t lex_id) except 0:
|
|
||||||
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
|
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
|
||||||
gives an integer representation of the cluster ID of the word,
|
gives an integer representation of the cluster ID of the word,
|
||||||
which should be understood as a binary address:
|
which should be understood as a binary address:
|
||||||
|
@ -88,10 +60,10 @@ cpdef ClusterID cluster_of(size_t lex_id) except 0:
|
||||||
while "dapple" is totally different. On the other hand, "scalable" receives
|
while "dapple" is totally different. On the other hand, "scalable" receives
|
||||||
the same cluster ID as "pineapple", which is not what we'd like.
|
the same cluster ID as "pineapple", which is not what we'd like.
|
||||||
'''
|
'''
|
||||||
return (<Lexeme*>lex_id).dist.cluster
|
return (<Lexeme*>lex_id).cluster
|
||||||
|
|
||||||
|
|
||||||
cpdef Py_UNICODE first_of(size_t lex_id):
|
cpdef char first_of(size_t lex_id) except 0:
|
||||||
'''Access the `first' field of the Lexeme pointed to by lex_id, which
|
'''Access the `first' field of the Lexeme pointed to by lex_id, which
|
||||||
stores the first character of the lex string of the word.
|
stores the first character of the lex string of the word.
|
||||||
|
|
||||||
|
@ -99,10 +71,10 @@ cpdef Py_UNICODE first_of(size_t lex_id):
|
||||||
>>> unhash(first_of(lex_id))
|
>>> unhash(first_of(lex_id))
|
||||||
u'H'
|
u'H'
|
||||||
'''
|
'''
|
||||||
return (<Lexeme*>lex_id).orth.first
|
return (<Lexeme*>lex_id).string[0]
|
||||||
|
|
||||||
|
|
||||||
cpdef size_t length_of(size_t lex_id) except *:
|
cpdef size_t length_of(size_t lex_id) except 0:
|
||||||
'''Access the `length' field of the Lexeme pointed to by lex_id, which stores
|
'''Access the `length' field of the Lexeme pointed to by lex_id, which stores
|
||||||
the length of the string hashed by lex_of.'''
|
the length of the string hashed by lex_of.'''
|
||||||
cdef Lexeme* word = <Lexeme*>lex_id
|
cdef Lexeme* word = <Lexeme*>lex_id
|
||||||
|
@ -119,8 +91,10 @@ cpdef double prob_of(size_t lex_id) except 0:
|
||||||
>>> prob_of(lookup(u'world'))
|
>>> prob_of(lookup(u'world'))
|
||||||
-20.10340371976182
|
-20.10340371976182
|
||||||
'''
|
'''
|
||||||
return (<Lexeme*>lex_id).dist.prob
|
return (<Lexeme*>lex_id).prob
|
||||||
|
|
||||||
|
DEF OFT_UPPER = 1
|
||||||
|
DEF OFT_TITLE = 2
|
||||||
|
|
||||||
cpdef bint is_oft_upper(size_t lex_id):
|
cpdef bint is_oft_upper(size_t lex_id):
|
||||||
'''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
|
'''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
|
||||||
|
@ -134,7 +108,7 @@ cpdef bint is_oft_upper(size_t lex_id):
|
||||||
>>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
|
>>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
|
||||||
True
|
True
|
||||||
'''
|
'''
|
||||||
return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
|
return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
|
||||||
|
|
||||||
|
|
||||||
cpdef bint is_oft_title(size_t lex_id):
|
cpdef bint is_oft_title(size_t lex_id):
|
||||||
|
@ -149,11 +123,15 @@ cpdef bint is_oft_title(size_t lex_id):
|
||||||
>>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
|
>>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
|
||||||
True
|
True
|
||||||
'''
|
'''
|
||||||
return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
|
return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
|
||||||
|
|
||||||
cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
|
cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
|
||||||
return (<Lexeme*>lex_id).orth.flags & (1 << flag)
|
return (<Lexeme*>lex_id).orth_flags & (1 << flag)
|
||||||
|
|
||||||
|
|
||||||
cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
|
cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
|
||||||
return (<Lexeme*>lex_id).dist.flags & (1 << flag)
|
return (<Lexeme*>lex_id).dist_flags & (1 << flag)
|
||||||
|
|
||||||
|
|
||||||
|
cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
|
||||||
|
return (<Lexeme*>lex_id).possible_tags & (1 << flag)
|
||||||
|
|
|
@ -19,8 +19,6 @@ ctypedef int ClusterID
|
||||||
|
|
||||||
|
|
||||||
from spacy.lexeme cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
from spacy.lexeme cimport Distribution
|
|
||||||
from spacy.lexeme cimport Orthography
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
|
@ -29,7 +27,7 @@ cdef class Language:
|
||||||
cdef dense_hash_map[StringHash, size_t] vocab
|
cdef dense_hash_map[StringHash, size_t] vocab
|
||||||
cdef dict bacov
|
cdef dict bacov
|
||||||
|
|
||||||
cdef Tokens tokenize(self, unicode text)
|
cpdef Tokens tokenize(self, unicode text)
|
||||||
|
|
||||||
cdef Lexeme* lookup(self, unicode string) except NULL
|
cdef Lexeme* lookup(self, unicode string) except NULL
|
||||||
cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL
|
cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL
|
||||||
|
@ -37,7 +35,8 @@ cdef class Language:
|
||||||
cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
|
cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
|
||||||
cdef Lexeme* new_lexeme(self, unicode lex) except NULL
|
cdef Lexeme* new_lexeme(self, unicode lex) except NULL
|
||||||
|
|
||||||
cdef unicode unhash(self, StringHash hashed)
|
cpdef unicode unhash(self, StringHash hashed)
|
||||||
|
|
||||||
cpdef list find_substrings(self, unicode word)
|
cpdef list find_substrings(self, unicode chunk)
|
||||||
cdef int find_split(self, unicode word)
|
cdef int find_split(self, unicode word)
|
||||||
|
cdef int set_orth(self, unicode string, Lexeme* word)
|
||||||
|
|
144
spacy/spacy.pyx
144
spacy/spacy.pyx
|
@ -1,4 +1,13 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
|
# cython: embedsignature=True
|
||||||
|
"""Common classes and utilities across languages.
|
||||||
|
|
||||||
|
Provides the main implementation for the spacy tokenizer. Specific languages
|
||||||
|
subclass the Language class, over-writing the tokenization rules as necessary.
|
||||||
|
Special-case tokenization rules are read from data/<lang>/tokenization .
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
|
@ -6,54 +15,13 @@ from libcpp.pair cimport pair
|
||||||
from cython.operator cimport dereference as deref
|
from cython.operator cimport dereference as deref
|
||||||
|
|
||||||
from spacy.lexeme cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
from spacy.lexeme cimport BLANK_WORD
|
from spacy.lexeme cimport LexID
|
||||||
|
|
||||||
from spacy.string_tools cimport substr
|
|
||||||
|
|
||||||
from . import util
|
from . import util
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
DIST_FLAGS = {}
|
|
||||||
TAGS = {}
|
TAGS = {}
|
||||||
|
DIST_FLAGS = {}
|
||||||
|
|
||||||
def get_normalized(unicode lex):
|
|
||||||
if lex.isalpha() and lex.islower():
|
|
||||||
return lex
|
|
||||||
else:
|
|
||||||
return get_word_shape(lex)
|
|
||||||
|
|
||||||
|
|
||||||
def get_word_shape(unicode lex):
|
|
||||||
cdef size_t length = len(lex)
|
|
||||||
shape = ""
|
|
||||||
last = ""
|
|
||||||
shape_char = ""
|
|
||||||
seq = 0
|
|
||||||
for c in lex:
|
|
||||||
if c.isalpha():
|
|
||||||
if c.isupper():
|
|
||||||
shape_char = "X"
|
|
||||||
else:
|
|
||||||
shape_char = "x"
|
|
||||||
elif c.isdigit():
|
|
||||||
shape_char = "d"
|
|
||||||
else:
|
|
||||||
shape_char = c
|
|
||||||
if shape_char == last:
|
|
||||||
seq += 1
|
|
||||||
else:
|
|
||||||
seq = 0
|
|
||||||
last = shape_char
|
|
||||||
if seq < 3:
|
|
||||||
shape += shape_char
|
|
||||||
assert shape
|
|
||||||
return shape
|
|
||||||
|
|
||||||
|
|
||||||
def set_orth_flags(lex):
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
def __cinit__(self, name):
|
def __cinit__(self, name):
|
||||||
|
@ -64,9 +32,19 @@ cdef class Language:
|
||||||
self.chunks.set_empty_key(0)
|
self.chunks.set_empty_key(0)
|
||||||
self.vocab.set_empty_key(0)
|
self.vocab.set_empty_key(0)
|
||||||
self.load_tokenization(util.read_tokenization(name))
|
self.load_tokenization(util.read_tokenization(name))
|
||||||
#self.load_dist_info(util.read_dist_info(name))
|
self.load_dist_info(util.read_dist_info(name))
|
||||||
|
|
||||||
cdef Tokens tokenize(self, unicode string):
|
cpdef Tokens tokenize(self, unicode string):
|
||||||
|
"""Tokenize.
|
||||||
|
|
||||||
|
Split the string into tokens.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
string (unicode): The string to split.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tokens (Tokens): A Tokens object.
|
||||||
|
"""
|
||||||
cdef Lexeme** chunk
|
cdef Lexeme** chunk
|
||||||
cdef Tokens tokens = Tokens(self)
|
cdef Tokens tokens = Tokens(self)
|
||||||
cdef size_t length = len(string)
|
cdef size_t length = len(string)
|
||||||
|
@ -85,8 +63,7 @@ cdef class Language:
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef Lexeme* lookup(self, unicode string) except NULL:
|
cdef Lexeme* lookup(self, unicode string) except NULL:
|
||||||
if len(string) == 0:
|
assert len(string) != 0
|
||||||
return &BLANK_WORD
|
|
||||||
cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
|
cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
|
||||||
if word == NULL:
|
if word == NULL:
|
||||||
word = self.new_lexeme(string)
|
word = self.new_lexeme(string)
|
||||||
|
@ -113,56 +90,79 @@ cdef class Language:
|
||||||
cdef bytes byte_string = string.encode('utf8')
|
cdef bytes byte_string = string.encode('utf8')
|
||||||
word.string = <char*>byte_string
|
word.string = <char*>byte_string
|
||||||
word.length = len(byte_string)
|
word.length = len(byte_string)
|
||||||
word.orth.flags = set_orth_flags(string)
|
self.set_orth(string, word)
|
||||||
cdef unicode norm = get_normalized(string)
|
|
||||||
cdef unicode shape = get_word_shape(string)
|
|
||||||
cdef unicode last3 = string[-3:]
|
|
||||||
word.lex = hash(string)
|
|
||||||
word.orth.norm = hash(norm)
|
|
||||||
word.orth.shape = hash(shape)
|
|
||||||
word.orth.last3 = hash(last3)
|
|
||||||
self.bacov[word.lex] = string
|
|
||||||
self.bacov[word.orth.norm] = norm
|
|
||||||
self.bacov[word.orth.shape] = shape
|
|
||||||
self.bacov[word.orth.last3] = last3
|
|
||||||
|
|
||||||
self.vocab[hash(string)] = <size_t>word
|
word.lex = hash(string)
|
||||||
|
self.bacov[word.lex] = string
|
||||||
|
self.vocab[word.lex] = <LexID>word
|
||||||
return word
|
return word
|
||||||
|
|
||||||
cdef unicode unhash(self, StringHash hash_value):
|
cpdef unicode unhash(self, StringHash hash_value):
|
||||||
'''Fetch a string from the reverse index, given its hash value.'''
|
'''Fetch a string from the reverse index, given its hash value.'''
|
||||||
return self.bacov[hash_value]
|
return self.bacov[hash_value]
|
||||||
|
|
||||||
cpdef list find_substrings(self, unicode word):
|
cpdef list find_substrings(self, unicode chunk):
|
||||||
|
"""Find how to split a chunk into substrings.
|
||||||
|
|
||||||
|
This method calls find_split repeatedly. Most languages will want to
|
||||||
|
override find_split, but it may be useful to override this instead.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chunk (unicode): The string to be split, e.g. u"Mike's!"
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
substrings (list): The component substrings, e.g. [u"Mike", u"'s", u"!"].
|
||||||
|
"""
|
||||||
substrings = []
|
substrings = []
|
||||||
while word:
|
while chunk:
|
||||||
split = self.find_split(word)
|
split = self.find_split(chunk)
|
||||||
if split == 0:
|
if split == 0:
|
||||||
substrings.append(word)
|
substrings.append(chunk)
|
||||||
break
|
break
|
||||||
substrings.append(word[:split])
|
substrings.append(chunk[:split])
|
||||||
word = word[split:]
|
chunk = chunk[split:]
|
||||||
return substrings
|
return substrings
|
||||||
|
|
||||||
cdef int find_split(self, unicode word):
|
cdef int find_split(self, unicode word):
|
||||||
return len(word)
|
return len(word)
|
||||||
|
|
||||||
def load_tokenization(self, token_rules=None):
|
cdef int set_orth(self, unicode string, Lexeme* word):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def load_tokenization(self, token_rules):
|
||||||
|
'''Load special-case tokenization rules.
|
||||||
|
|
||||||
|
Loads special-case tokenization rules into the Language.chunk cache,
|
||||||
|
read from data/<lang>/tokenization . The special cases are loaded before
|
||||||
|
any language data is tokenized, giving these priority. For instance,
|
||||||
|
the English tokenization rules map "ain't" to ["are", "not"].
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
|
||||||
|
a string and tokens is a list of strings.
|
||||||
|
'''
|
||||||
for chunk, tokens in token_rules:
|
for chunk, tokens in token_rules:
|
||||||
self.new_chunk(chunk, tokens)
|
self.new_chunk(chunk, tokens)
|
||||||
|
|
||||||
def load_dist_info(self, dist_info):
|
def load_dist_info(self, dist_info):
|
||||||
|
'''Load distributional information for the known lexemes of the language.
|
||||||
|
|
||||||
|
The distributional information is read from data/<lang>/dist_info.json .
|
||||||
|
It contains information like the (smoothed) unigram log probability of
|
||||||
|
the word, how often the word is found upper-cased, how often the word
|
||||||
|
is found title-cased, etc.
|
||||||
|
'''
|
||||||
cdef unicode string
|
cdef unicode string
|
||||||
cdef dict word_dist
|
cdef dict word_dist
|
||||||
cdef Lexeme* w
|
cdef Lexeme* w
|
||||||
for string, word_dist in dist_info.items():
|
for string, word_dist in dist_info.items():
|
||||||
w = self.lookup(string)
|
w = self.lookup(string)
|
||||||
w.dist.prob = word_dist.prob
|
w.prob = word_dist.prob
|
||||||
w.dist.cluster = word_dist.cluster
|
w.cluster = word_dist.cluster
|
||||||
for flag in word_dist.flags:
|
for flag in word_dist.flags:
|
||||||
w.dist.flags |= DIST_FLAGS[flag]
|
w.dist_flags |= DIST_FLAGS[flag]
|
||||||
for tag in word_dist.tagdict:
|
for tag in word_dist.tagdict:
|
||||||
w.dist.tagdict |= TAGS[tag]
|
w.possible_tags |= TAGS[tag]
|
||||||
|
|
||||||
|
|
||||||
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
|
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
|
||||||
|
|
|
@ -4,7 +4,6 @@ from spacy.lexeme cimport Lexeme
|
||||||
|
|
||||||
from cython.operator cimport dereference as deref
|
from cython.operator cimport dereference as deref
|
||||||
from spacy.spacy cimport Language
|
from spacy.spacy cimport Language
|
||||||
from spacy.lexeme cimport StringAttr
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokens:
|
cdef class Tokens:
|
||||||
|
@ -15,5 +14,5 @@ cdef class Tokens:
|
||||||
cpdef int append(self, Lexeme_addr token)
|
cpdef int append(self, Lexeme_addr token)
|
||||||
cpdef int extend(self, Tokens other) except -1
|
cpdef int extend(self, Tokens other) except -1
|
||||||
|
|
||||||
cpdef object group_by(self, StringAttr attr)
|
cpdef object group_by(self, size_t attr)
|
||||||
cpdef dict count_by(self, StringAttr attr)
|
cpdef dict count_by(self, size_t attr)
|
||||||
|
|
|
@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
|
||||||
|
|
||||||
|
|
||||||
from spacy.lexeme cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
|
#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
|
||||||
from spacy.spacy cimport StringHash
|
from spacy.spacy cimport StringHash
|
||||||
|
|
||||||
|
|
||||||
|
@ -37,7 +37,7 @@ cdef class Tokens:
|
||||||
for el in other:
|
for el in other:
|
||||||
self.append(el)
|
self.append(el)
|
||||||
|
|
||||||
cpdef object group_by(self, StringAttr attr):
|
cpdef object group_by(self, size_t attr):
|
||||||
'''Group tokens that share the property attr into Tokens instances, and
|
'''Group tokens that share the property attr into Tokens instances, and
|
||||||
return a list of them. Returns a tuple of three lists:
|
return a list of them. Returns a tuple of three lists:
|
||||||
|
|
||||||
|
@ -66,7 +66,8 @@ cdef class Tokens:
|
||||||
cdef StringHash key
|
cdef StringHash key
|
||||||
cdef Lexeme_addr t
|
cdef Lexeme_addr t
|
||||||
for t in self.vctr[0]:
|
for t in self.vctr[0]:
|
||||||
key = attr_of(t, attr)
|
#key = attr_of(t, attr)
|
||||||
|
key = 0
|
||||||
if key in indices:
|
if key in indices:
|
||||||
groups[indices[key]].append(t)
|
groups[indices[key]].append(t)
|
||||||
else:
|
else:
|
||||||
|
@ -77,12 +78,13 @@ cdef class Tokens:
|
||||||
groups[-1].append(t)
|
groups[-1].append(t)
|
||||||
return names, hashes, groups
|
return names, hashes, groups
|
||||||
|
|
||||||
cpdef dict count_by(self, StringAttr attr):
|
cpdef dict count_by(self, size_t attr):
|
||||||
counts = {}
|
counts = {}
|
||||||
cdef Lexeme_addr t
|
cdef Lexeme_addr t
|
||||||
cdef StringHash key
|
cdef StringHash key
|
||||||
for t in self.vctr[0]:
|
for t in self.vctr[0]:
|
||||||
key = attr_of(t, attr)
|
#key = attr_of(t, attr)
|
||||||
|
key = 0
|
||||||
if key not in counts:
|
if key not in counts:
|
||||||
counts[key] = 0
|
counts[key] = 0
|
||||||
counts[key] += 1
|
counts[key] += 1
|
||||||
|
|
Loading…
Reference in New Issue
Block a user