Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 17:36:30 +03:00

Commit 68bae2fec6 (parent 88095666dc): "More refactoring"

setup.py | 10
@@ -45,13 +45,13 @@ else:
 exts = [
-    #Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.en", ["spacy/en.pyx"], language="c++",
-              include_dirs=includes),
-    Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.lang", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.word", ["spacy/word.pyx"], language="c++",
               include_dirs=includes),
+    Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.en", ["spacy/en.pyx"], language="c++",
+              include_dirs=includes),
 ]
@@ -1,25 +0,0 @@
-from libc.stdint cimport uint64_t
-
-from chartree cimport CharTree
-
-
-cdef class FixedTable:
-    cdef size_t size
-    cdef uint64_t* keys
-    cdef size_t* values
-
-    cdef size_t insert(self, uint64_t key, size_t value) nogil
-    cdef size_t get(self, uint64_t key) nogil
-    cdef int erase(self, uint64_t key) nogil
-
-
-cdef class WordTree:
-    cdef size_t max_length
-    cdef size_t default
-    cdef CharTree* _trees
-    cdef dict _dict
-
-    cdef size_t get(self, unicode string) except *
-    cdef int set(self, unicode string, size_t value) except *
-    cdef bint contains(self, unicode string) except *
@@ -1,98 +0,0 @@
-from libc.stdlib cimport calloc, free
-import cython
-
-cimport chartree
-
-
-cdef class FixedTable:
-    def __cinit__(self, const size_t size):
-        self.size = size
-        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
-        self.values = <size_t*>calloc(self.size, sizeof(size_t))
-
-    def __dealloc__(self):
-        free(self.keys)
-        free(self.values)
-
-    def __getitem__(self, uint64_t key):
-        return self.get(key)
-
-    def __setitem__(self, uint64_t key, size_t value):
-        self.insert(key, value)
-
-    def pop(self, uint64_t key):
-        self.delete(key)
-
-    def bucket(self, uint64_t key):
-        return _find(key, self.size)
-
-    cdef size_t insert(self, uint64_t key, size_t value) nogil:
-        cdef size_t bucket = _find(key, self.size)
-        cdef size_t clobbered
-        if self.values[bucket] == value:
-            clobbered = 0
-        else:
-            clobbered = self.values[bucket]
-        self.keys[bucket] = key
-        self.values[bucket] = value
-        return clobbered
-
-    cdef size_t get(self, uint64_t key) nogil:
-        cdef size_t bucket = _find(key, self.size)
-        if self.keys[bucket] == key:
-            return self.values[bucket]
-        else:
-            return 0
-
-    cdef int erase(self, uint64_t key) nogil:
-        cdef size_t bucket = _find(key, self.size)
-        self.keys[bucket] = 0
-        self.values[bucket] = 0
-
-
-@cython.cdivision
-cdef inline size_t _find(uint64_t key, size_t size) nogil:
-    return key % size
-
-
-cdef class WordTree:
-    def __cinit__(self, size_t default, size_t max_length):
-        self.max_length = max_length
-        self.default = default
-        self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
-        for i in range(self.max_length):
-            chartree.init(&self._trees[i], i)
-        self._dict = {}
-
-    cdef size_t get(self, unicode ustring) except *:
-        cdef bytes bstring = ustring.encode('utf8')
-        cdef size_t length = len(bstring)
-        if length >= self.max_length:
-            return self._dict.get(bstring, 0)
-        else:
-            return chartree.getitem(&self._trees[length], bstring)
-
-    cdef int set(self, unicode ustring, size_t value) except *:
-        cdef bytes bstring = ustring.encode('utf8')
-        cdef size_t length = len(bstring)
-        if length >= self.max_length:
-            self._dict[bstring] = value
-        else:
-            chartree.setitem(&self._trees[length], bstring, value)
-
-    cdef bint contains(self, unicode ustring) except *:
-        cdef bytes bstring = ustring.encode('utf8')
-        cdef size_t length = len(bstring)
-        if length >= self.max_length:
-            return bstring in self._dict
-        else:
-            return chartree.contains(&self._trees[length], bstring)
-
-    def __getitem__(self, unicode key):
-        return self.get(key)
-
-    def __setitem__(self, unicode key, size_t value):
-        self.set(key, value)
-
-    def __contains__(self, unicode key):
-        return self.contains(key)
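Aside: the FixedTable just deleted is a fixed-size, modulo-indexed hash table in which a colliding key simply overwrites ("clobbers") the previous occupant instead of probing, while WordTree dispatches strings to a per-byte-length CharTree and falls back to a plain dict for strings of max_length or more. A rough pure-Python sketch of the FixedTable behaviour, for illustration only (not part of the commit):

# Pure-Python sketch of the deleted FixedTable's semantics.
class FixedTable:
    def __init__(self, size):
        self.size = size
        self.keys = [0] * size      # 0 doubles as "empty", as in the Cython code
        self.values = [0] * size

    def insert(self, key, value):
        bucket = key % self.size    # same as the cdef _find() helper
        clobbered = 0 if self.values[bucket] == value else self.values[bucket]
        self.keys[bucket] = key     # no probing: collisions overwrite
        self.values[bucket] = value
        return clobbered            # caller can see what was evicted

    def get(self, key):
        bucket = key % self.size
        return self.values[bucket] if self.keys[bucket] == key else 0

table = FixedTable(1024)
table.insert(7, 42)
assert table.get(7) == 42
assert table.get(7 + 1024) == 0     # same bucket as 7, but the key differs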
spacy/en.pxd | 36
@@ -1,15 +1,38 @@
-from libcpp.vector cimport vector
-
-from spacy.spacy cimport StringHash
-
 from spacy.spacy cimport Language
-from spacy.word cimport LatinWord
+from spacy.word cimport Lexeme
 cimport cython
 
 
+cpdef size_t ALPHA
+cpdef size_t DIGIT
+cpdef size_t PUNCT
+cpdef size_t SPACE
+cpdef size_t LOWER
+cpdef size_t UPPER
+cpdef size_t TITLE
+cpdef size_t ASCII
+
+cpdef size_t OFT_LOWER
+cpdef size_t OFT_TITLE
+cpdef size_t OFT_UPPER
+
+cpdef size_t PUNCT
+cpdef size_t CONJ
+cpdef size_t NUM
+cpdef size_t N
+cpdef size_t DET
+cpdef size_t ADP
+cpdef size_t ADJ
+cpdef size_t ADV
+cpdef size_t VERB
+cpdef size_t NOUN
+cpdef size_t PDT
+cpdef size_t POS
+cpdef size_t PRON
+cpdef size_t PRT
+
+
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
-    cdef LatinWord new_lexeme(self, unicode string)
 
 
 cdef English EN

@@ -17,4 +40,3 @@ cdef English EN
 cpdef Word lookup(unicode word)
 cpdef list tokenize(unicode string)
-cpdef unicode unhash(StringHash hash_value)
spacy/en.pyx | 102
@@ -43,9 +43,85 @@ from libc.stdint cimport uint64_t
 cimport spacy
 
 
+# Python-readable flag constants --- can't read an enum from Python
+# Don't want to manually assign these numbers, or we'll insert one and have to
+# change them all.
+# Don't use "i", as we don't want it in the global scope!
+cdef size_t __i = 0
+
+ALPHA = __i; __i += 1
+DIGIT = __i; __i += 1
+PUNCT = __i; __i += 1
+SPACE = __i; __i += 1
+LOWER = __i; __i += 1
+UPPER = __i; __i += 1
+TITLE = __i; __i += 1
+ASCII = __i; __i += 1
+
+OFT_LOWER = __i; __i += 1
+OFT_UPPER = __i; __i += 1
+OFT_TITLE = __i; __i += 1
+
+PUNCT = __i; __i += 1
+CONJ = __i; __i += 1
+NUM = __i; __i += 1
+X = __i; __i += 1
+DET = __i; __i += 1
+ADP = __i; __i += 1
+ADJ = __i; __i += 1
+ADV = __i; __i += 1
+VERB = __i; __i += 1
+NOUN = __i; __i += 1
+PDT = __i; __i += 1
+POS = __i; __i += 1
+PRON = __i; __i += 1
+PRT = __i; __i += 1
+
+
+# These are for the string views
+__i = 0
+SIC = __i; __i += 1
+CANON_CASED = __i; __i += 1
+NON_SPARSE = __i; __i += 1
+SHAPE = __i; __i += 1
+NR_STRING_VIEWS = __i
+
+
+def get_string_views(unicode string, lexeme):
+    views = ['' for _ in range(NR_STRING_VIEWS)]
+    views[SIC] = string
+    views[CANON_CASED] = canonicalize_case(string, lexeme)
+    views[SHAPE] = get_string_shape(string)
+    views[NON_SPARSE] = get_non_sparse(string, views[CANON_CASED], views[SHAPE],
+                                       lexeme)
+    return views
+
+
+def set_orth_flags(unicode string, flags_t flags):
+    setters = [
+        (ALPHA, is_alpha),
+        (DIGIT, is_digit),
+        (PUNCT, is_punct),
+        (SPACE, is_space),
+        (LOWER, is_lower),
+        (UPPER, is_upper),
+        (SPACE, is_space)
+    ]
+
+    for bit, setter in setters:
+        if setter(string):
+            flags |= 1 << bit
+    return flags
+
+
 cdef class English(spacy.Language):
-    cdef LatinWord new_lexeme(self, unicode string):
-        return LatinWord(string)
+    cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None,
+                           tag_freqs=None):
+        return Lexeme(s, length, views, prob=prob, cluster=cluster,
+                      flags=self.get_flags(string))
 
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
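The constants above are bit positions: set_orth_flags packs one boolean feature per bit of a single integer, and flag accessors read a bit back out. A minimal Python sketch of that packing (the flag names follow the diff; the predicates are stand-ins for the module's is_alpha and friends):

# Bit-flag packing as in set_orth_flags above; the predicates are stand-ins.
ALPHA, DIGIT, LOWER = 0, 1, 4

def set_orth_flags(string, flags=0):
    setters = [(ALPHA, str.isalpha), (DIGIT, str.isdigit), (LOWER, str.islower)]
    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit           # one bit per boolean feature
    return flags

flags = set_orth_flags(u'hello')
assert flags & (1 << ALPHA)             # reading a bit back out
assert flags & (1 << LOWER)
assert not flags & (1 << DIGIT)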
@@ -101,7 +177,7 @@ cpdef list tokenize(unicode string):
     return EN.tokenize(string)
 
 
-cpdef Word lookup(unicode string):
+cpdef Lexeme lookup(unicode string):
     """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
 
     Properties of the Lexeme are accessed by passing LexID to the accessor methods.
@@ -116,23 +192,6 @@ cpdef Word lookup(unicode string):
     return EN.lookup(string)
 
 
-cpdef unicode unhash(StringHash hash_value):
-    """Retrieve a string from a hash value. Mostly used for testing.
-
-    In general you should avoid computing with strings, as they are slower than
-    the intended ID-based usage. However, strings can be recovered if necessary,
-    although no control is taken for hash collisions.
-
-    Args:
-        hash_value (StringHash): The hash of a string, returned by Python's hash()
-            function.
-
-    Returns:
-        string (unicode): A unicode string that hashes to the hash_value.
-    """
-    return EN.unhash(hash_value)
-
-
 def add_string_views(view_funcs):
     """Add a string view to existing and previous lexical entries.
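The removed unhash depended on a hash-to-string reverse index (the Language.bacov dict, "vocab" reversed, which this commit also deletes). A minimal sketch of that mechanism, with the same collision caveat the docstring gives; index_string is a hypothetical helper for illustration:

# Minimal sketch of the hash -> string reverse index behind unhash().
# Colliding hashes silently overwrite, the caveat the docstring notes.
bacov = {}

def index_string(s):
    bacov[hash(s)] = s
    return hash(s)

def unhash(hash_value):
    return bacov[hash_value]

key = index_string(u'hello')
assert unhash(key) == u'hello'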
@@ -150,16 +209,19 @@ def load_clusters(location):
     """
     pass
 
 
 def load_unigram_probs(location):
     """Load unigram probabilities.
     """
     pass
 
 
 def load_case_stats(location):
     """Load case stats.
     """
     pass
 
 
 def load_tag_stats(location):
     """Load tag statistics.
     """
@@ -1,16 +1,12 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
-from spacy.word cimport Word
+from spacy.word cimport Lexeme
 
-ctypedef uint32_t StringHash
-
 
 cdef class Language:
     cdef object name
-    cdef dict chunks
-    cdef dict vocab
-    cdef dict bacov
+    cdef dict blobs
+    cdef dict lexicon
 
     cpdef list tokenize(self, unicode text)

@@ -20,8 +16,5 @@ cdef class Language:
     cdef list new_chunk(self, unicode string, list substrings)
     cdef Word new_lexeme(self, unicode lex)
-
-    cpdef unicode unhash(self, StringHash hashed)
-
     cpdef list find_substrings(self, unicode chunk)
     cdef int find_split(self, unicode word)
-    cdef int set_orth(self, unicode string, Word word)
@@ -15,16 +15,13 @@ from libc.stdlib cimport calloc, free
 from . import util
 from os import path
 
-TAGS = {}
-DIST_FLAGS = {}
-
 
 cdef class Language:
     view_funcs = []
     def __cinit__(self, name):
         self.name = name
-        self.bacov = {}
-        self.chunks = {}
-        self.vocab = {}
+        self.blobs = {}
+        self.lexicon = {}
         self.load_tokenization(util.read_tokenization(name))
         self.load_dist_info(util.read_dist_info(name))
@@ -37,26 +34,26 @@ cdef class Language:
         string (unicode): The string to split.
 
         Returns:
-            tokens (Tokens): A Tokens object.
+            tokens (list): A list of Lexeme objects.
         """
-        cdef list chunk
+        cdef list blob
         cdef list tokens = []
         cdef size_t length = len(string)
         cdef size_t start = 0
         cdef size_t i = 0
         for c in string:
-            if _is_whitespace(c):
+            if c == ' ':
                 if start < i:
-                    chunk = self.lookup_chunk(string[start:i])
-                    tokens.extend(chunk)
+                    blob = self.lookup_blob(string[start:i])
+                    tokens.extend(blob)
                 start = i + 1
             i += 1
         if start < i:
-            chunk = self.lookup_chunk(string[start:])
+            chunk = self.lookup_blob(string[start:])
             tokens.extend(chunk)
         return tokens
 
-    cdef Word lookup(self, unicode string):
+    cdef Lexeme lookup(self, unicode string):
         assert len(string) != 0
         cdef Word word
         if string in self.vocab:
@@ -65,28 +62,26 @@ cdef class Language:
             word = self.new_lexeme(string)
         return word
 
-    cdef list lookup_chunk(self, unicode string):
+    cdef list lookup_blob(self, unicode string):
         cdef list chunk
-        cdef size_t chunk_id
-        if string in self.chunks:
-            chunk = self.chunks[string]
+        cdef size_t blob_id
+        if string in self.blobs:
+            blob = self.blobs[string]
         else:
-            chunk = self.new_chunk(string, self.find_substrings(string))
+            blob = self.new_blob(string, self.find_substrings(string))
         return chunk
 
-    cdef list new_chunk(self, unicode string, list substrings):
-        chunk = []
+    cdef list new_blob(self, unicode string, list substrings):
+        blob = []
         for i, substring in enumerate(substrings):
-            chunk.append(self.lookup(substring))
-        self.chunks[string] = chunk
-        return chunk
+            blob.append(self.lookup(substring))
+        self.blobs[string] = chunk
+        return blob
 
     cdef Word new_lexeme(self, unicode string):
-        string_views = [view_func(string) for view_func in self.view_funcs]
-        word = Word(string.encode('utf8'), string_views)
-        self.bacov[word.lex] = string
-        self.vocab[string] = word
-        return word
+        # TODO
+        #lexeme = Lexeme(string.encode('utf8'), string_views)
+        #return lexeme
 
     """
     def add_view_funcs(self, list view_funcs):
@@ -112,11 +107,7 @@ cdef class Language:
         self.bacov[hashed] = view
     """
 
-    cpdef unicode unhash(self, StringHash hash_value):
-        '''Fetch a string from the reverse index, given its hash value.'''
-        return self.bacov[hash_value]
-
-    cpdef list find_substrings(self, unicode chunk):
+    cpdef list find_substrings(self, unicode blob):
         """Find how to split a chunk into substrings.
 
         This method calls find_split repeatedly. Most languages will want to
@@ -129,21 +120,18 @@ cdef class Language:
             substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
         """
         substrings = []
-        while chunk:
-            split = self.find_split(chunk)
+        while blob:
+            split = self.find_split(blob)
             if split == 0:
-                substrings.append(chunk)
+                substrings.append(blob)
                 break
-            substrings.append(chunk[:split])
-            chunk = chunk[split:]
+            substrings.append(blob[:split])
+            blob = blob[split:]
         return substrings
 
     cdef int find_split(self, unicode word):
         return len(word)
 
-    cdef int set_orth(self, unicode string, Word word):
-        pass
-
     def load_tokenization(self, token_rules):
         '''Load special-case tokenization rules.

@@ -178,22 +166,3 @@ cdef class Language:
                 w.dist_flags |= DIST_FLAGS[flag]
             for tag in word_dist.tagdict:
                 w.possible_tags |= TAGS[tag]
-
-
-cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
-    if c == ' ':
-        return True
-    elif c == '\n':
-        return True
-    elif c == '\t':
-        return True
-    else:
-        return False
-
-
-#cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
-#    cdef size_t i = 0
-#    while chunk[i] != NULL:
-#        tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
-#        tokens.length += 1
-#        i += 1
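Taken together, tokenize and find_substrings implement a two-level scheme: split on whitespace first, then carve each whitespace-delimited blob into substrings via repeated find_split calls, caching the result per blob. A rough pure-Python sketch of that control flow (plain strings stand in for the Lexeme machinery; not part of the commit):

# Two-level tokenization as in Language.tokenize(), with per-blob caching.
blobs = {}

def find_split(blob):
    return len(blob)                 # base Language never splits further

def find_substrings(blob):
    substrings = []
    while blob:
        split = find_split(blob)
        if split == 0:               # refuse to split: keep the rest whole
            substrings.append(blob)
            break
        substrings.append(blob[:split])
        blob = blob[split:]
    return substrings

def tokenize(string):
    tokens = []
    for piece in string.split(' '):
        if piece:
            if piece not in blobs:   # each distinct blob is analysed once
                blobs[piece] = find_substrings(piece)
            tokens.extend(blobs[piece])
    return tokens

assert tokenize(u"Hello world") == [u"Hello", u"world"]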
@@ -1,32 +0,0 @@
-cdef enum OrthFlag:
-    IS_ALPHA
-    IS_DIGIT
-    IS_PUNCT
-    IS_SPACE
-    IS_LOWER
-    IS_UPPER
-    IS_TITLE
-    IS_ASCII
-
-
-cdef enum:
-    NORM
-    SHAPE
-    LAST3
-
-from spacy.lexeme cimport LexID
-from spacy.lexeme cimport StringHash
-
-cpdef bint is_alpha(LexID lex_id) except *
-cpdef bint is_digit(LexID lex_id) except *
-cpdef bint is_punct(LexID lex_id) except *
-cpdef bint is_space(LexID lex_id) except *
-cpdef bint is_lower(LexID lex_id) except *
-cpdef bint is_upper(LexID lex_id) except *
-cpdef bint is_title(LexID lex_id) except *
-cpdef bint is_ascii(LexID lex_id) except *
-
-
-cpdef StringHash norm_of(LexID lex_id) except 0
-cpdef StringHash shape_of(LexID lex_id) except 0
-cpdef StringHash last3_of(LexID lex_id) except 0
@@ -1,211 +0,0 @@
-# cython: embedsignature=True
-from __future__ import unicode_literals
-
-from spacy.lexeme cimport Lexeme
-
-
-def get_normalized(unicode word):
-    """Todo.
-
-    Args:
-        word (unicode)
-
-    Returns:
-        normalized (unicode)
-    """
-    if word.isalpha() and word.islower():
-        return word
-    else:
-        return get_word_shape(word)
-
-
-def get_word_shape(unicode word):
-    """Todo.
-
-    Args:
-        word (unicode)
-
-    Returns:
-        shape (unicode)
-    """
-    cdef size_t length = len(word)
-    shape = ""
-    last = ""
-    shape_char = ""
-    seq = 0
-    for c in word:
-        if c.isalpha():
-            if c.isupper():
-                shape_char = "X"
-            else:
-                shape_char = "x"
-        elif c.isdigit():
-            shape_char = "d"
-        else:
-            shape_char = c
-        if shape_char == last:
-            seq += 1
-        else:
-            seq = 0
-            last = shape_char
-        if seq < 3:
-            shape += shape_char
-    assert shape
-    return shape
-
-
-cpdef unicode get_last3(unicode string):
-    return string[-3:]
-
-
-cpdef bint is_alpha(LexID lex_id) except *:
-    """Check whether all characters in the word's string are alphabetic.
-
-    Should match the :py:func:`unicode.isalpha()` function.
-
-    >>> is_alpha(lookup(u'Hello'))
-    True
-    >>> is_alpha(lookup(u'العرب'))
-    True
-    >>> is_alpha(lookup(u'10'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_ALPHA
-
-
-cpdef bint is_digit(LexID lex_id) except *:
-    """Check whether all characters in the word's string are numeric.
-
-    Should match the :py:func:`unicode.isdigit()` function.
-
-    >>> is_digit(lookup(u'10'))
-    True
-    >>> is_digit(lookup(u'๐'))
-    True
-    >>> is_digit(lookup(u'one'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_DIGIT
-
-
-cpdef bint is_punct(LexID lex_id) except *:
-    """Check whether all characters belong to a punctuation unicode data category
-    for a Lexeme ID.
-
-    >>> is_punct(lookup(u'.'))
-    True
-    >>> is_punct(lookup(u'⁒'))
-    True
-    >>> is_punct(lookup(u' '))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_PUNCT
-
-
-cpdef bint is_space(LexID lex_id) except *:
-    """Give the result of unicode.isspace() for a Lexeme ID.
-
-    >>> is_space(lookup(u'\\t'))
-    True
-    >>> is_space(lookup(u'<unicode space>'))
-    True
-    >>> is_space(lookup(u'Hi\\n'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
-
-
-cpdef bint is_lower(LexID lex_id) except *:
-    """Give the result of unicode.islower() for a Lexeme ID.
-
-    >>> is_lower(lookup(u'hi'))
-    True
-    >>> is_lower(lookup(<unicode>))
-    True
-    >>> is_lower(lookup(u'10'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_LOWER
-
-
-cpdef bint is_upper(LexID lex_id) except *:
-    """Give the result of unicode.isupper() for a Lexeme ID.
-
-    >>> is_upper(lookup(u'HI'))
-    True
-    >>> is_upper(lookup(u'H10'))
-    True
-    >>> is_upper(lookup(u'10'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_UPPER
-
-
-cpdef bint is_title(LexID lex_id) except *:
-    """Give the result of unicode.istitle() for a Lexeme ID.
-
-    >>> is_title(lookup(u'Hi'))
-    True
-    >>> is_title(lookup(u'Hi1'))
-    True
-    >>> is_title(lookup(u'1'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_TITLE
-
-
-cpdef bint is_ascii(LexID lex_id) except *:
-    """Give the result of checking whether all characters in the string are ascii.
-
-    >>> is_ascii(lookup(u'Hi'))
-    True
-    >>> is_ascii(lookup(u' '))
-    True
-    >>> is_title(lookup(u'<unicode>'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_ASCII
-
-
-cpdef StringHash norm_of(LexID lex_id) except 0:
-    """Return the hash of a "normalized" version of the string.
-
-    Normalized strings are intended to be less sparse, while still capturing
-    important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
-    for details of the normalization function.
-
-    >>> unhash(norm_of(lookup(u'Hi')))
-    u'hi'
-    >>> unhash(norm_of(lookup(u'255667')))
-    u'shape=dddd'
-    >>> unhash(norm_of(lookup(u'...')))
-    u'...'
-    """
-    return (<Lexeme*>lex_id).string_views[NORM]
-
-
-cpdef StringHash shape_of(LexID lex_id) except 0:
-    """Return the hash of a string describing the word's "orthographic shape".
-
-    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
-    function. Word shape features have been found useful for NER and POS tagging,
-    e.g. Manning (2011)
-
-    >>> unhash(shape_of(lookup(u'Hi')))
-    u'Xx'
-    >>> unhash(shape_of(lookup(u'255667')))
-    u'dddd'
-    >>> unhash(shape_of(lookup(u'...')))
-    u'...'
-    """
-    cdef Lexeme* w = <Lexeme*>lex_id
-    return w.string_views[SHAPE]
-
-
-cpdef StringHash last3_of(LexID lex_id) except 0:
-    '''Return the hash of string[-3:], i.e. the last three characters of the word.
-
-    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
-    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
-    [u'llo', u'!']
-    '''
-    return (<Lexeme*>lex_id).string_views[LAST3]
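For intuition about the shape feature, a direct pure-Python port of the get_word_shape just deleted: character classes map to X/x/d, and runs longer than three characters are truncated (note this yields u'ddd' for u'255667', slightly at odds with the u'dddd' shown in the norm_of docstring):

# Direct port of the deleted get_word_shape(), minus the Cython typing.
def word_shape(word):
    shape = ""
    last = ""
    seq = 0
    for c in word:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1                 # extend the current run
        else:
            seq = 0
            last = shape_char
        if seq < 3:                  # runs are capped at three characters
            shape += shape_char
    return shape

assert word_shape(u"Hello") == u"Xxxx"
assert word_shape(u"255667") == u"ddd"
assert word_shape(u"don't") == u"xxx'x"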
@@ -1,7 +0,0 @@
-cpdef bytes to_bytes(unicode string)
-
-cpdef unicode from_bytes(bytes string)
-
-cpdef unicode substr(unicode string, int start, int end, size_t length)
-
-cdef bint is_whitespace(Py_UNICODE c)
@@ -1,35 +0,0 @@
-# cython: profile=True
-
-cpdef bytes to_bytes(unicode string):
-    return string.encode('utf8')
-
-
-cpdef unicode from_bytes(bytes string):
-    return string.decode('utf8')
-
-
-cpdef unicode substr(unicode string, int start, int end, size_t length):
-    if end >= length:
-        end = -1
-    if start >= length:
-        start = 0
-    if start <= 0 and end < 0:
-        return string
-    elif start < 0:
-        start = 0
-    elif end < 0:
-        end = length
-    return string[start:end]
-
-
-cdef bint is_whitespace(Py_UNICODE c):
-    # TODO: Support other unicode spaces
-    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
-    if c == u' ':
-        return True
-    elif c == u'\n':
-        return True
-    elif c == u'\t':
-        return True
-    else:
-        return False
@@ -1,18 +0,0 @@
-from libcpp.vector cimport vector
-from spacy.lexeme cimport LexID
-from spacy.lexeme cimport Lexeme
-
-from cython.operator cimport dereference as deref
-from spacy.spacy cimport Language
-
-
-cdef class Tokens:
-    cdef Language lang
-    cdef vector[LexID]* vctr
-    cdef size_t length
-
-    cpdef int append(self, LexID token)
-    cpdef int extend(self, Tokens other) except -1
-
-    cpdef object group_by(self, size_t attr)
-    cpdef dict count_by(self, size_t attr)
@@ -1,92 +0,0 @@
-from cython.operator cimport dereference as deref
-from cython.operator cimport preincrement as inc
-
-
-from spacy.lexeme cimport Lexeme
-from spacy.spacy cimport StringHash
-
-
-cdef class Tokens:
-    def __cinit__(self, Language lang):
-        self.lang = lang
-        self.vctr = new vector[LexID]()
-        self.length = 0
-
-    def __dealloc__(self):
-        del self.vctr
-
-    def __iter__(self):
-        cdef vector[LexID].iterator it = self.vctr[0].begin()
-        while it != self.vctr[0].end():
-            yield deref(it)
-            inc(it)
-
-    def __getitem__(self, size_t idx):
-        return self.vctr[0].at(idx)
-
-    def __len__(self):
-        return self.length
-
-    cpdef int append(self, LexID token):
-        self.vctr[0].push_back(token)
-        self.length += 1
-
-    cpdef int extend(self, Tokens other) except -1:
-        cdef LexID el
-        for el in other:
-            self.append(el)
-
-    cpdef object group_by(self, size_t view_idx):
-        '''Group tokens that share the property attr into Tokens instances, and
-        return a list of them. Returns a tuple of three lists:
-
-        (string names, hashes, tokens)
-
-        The lists are aligned, so the ith entry in string names is the string
-        that the ith entry in hashes unhashes to, which the Tokens instance
-        is grouped by.
-
-        You can then use count_by or group_by on the Tokens
-        for further processing. Calling group_by and then asking the length
-        of the Tokens objects is equivalent to count_by, but somewhat slower.
-        '''
-        # Implementation here is working around some of the constraints in
-        # Cython about what type of thing can go in what type of container.
-        # Long story short, it's pretty hard to get a Python object like
-        # Tokens into a vector or array. If we really need this to run faster,
-        # we can be tricky and get the Python list access out of the loop. What
-        # we'd do is store pointers to the underlying vectors.
-        # So far, speed isn't mattering here.
-        cdef dict indices = {}
-        cdef list groups = []
-        cdef list names = []
-        cdef list hashes = []
-
-        cdef StringHash key
-        cdef LexID t
-        for t in self.vctr[0]:
-            if view_idx == 0:
-                key = (<Lexeme*>t).lex
-            else:
-                key = (<Lexeme*>t).string_views[view_idx - 1]
-            if key in indices:
-                groups[indices[key]].append(t)
-            else:
-                indices[key] = len(groups)
-                groups.append(Tokens(self.lang))
-                names.append(self.lang.unhash(key))
-                hashes.append(key)
-                groups[-1].append(t)
-        return names, hashes, groups
-
-    cpdef dict count_by(self, size_t attr):
-        counts = {}
-        cdef LexID t
-        cdef StringHash key
-        for t in self.vctr[0]:
-            #key = attr_of(t, attr)
-            key = 0
-            if key not in counts:
-                counts[key] = 0
-            counts[key] += 1
-        return counts
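The deleted group_by returns three aligned lists: the i-th name is the unhashed form of the i-th hash, and the i-th group holds the tokens sharing that key. A pure-Python rendering of that contract, with plain values standing in for the StringHash/Lexeme* machinery:

# Sketch of the deleted Tokens.group_by() contract: three aligned lists.
def group_by(tokens, key_of):
    indices = {}
    names, hashes, groups = [], [], []
    for t in tokens:
        key = key_of(t)
        if key not in indices:
            indices[key] = len(groups)
            groups.append([])            # a list stands in for a Tokens object
            names.append(str(key))       # stands in for lang.unhash(key)
            hashes.append(key)
        groups[indices[key]].append(t)
    return names, hashes, groups

names, hashes, groups = group_by([u"a", u"B", u"c"], key_of=str.isupper)
assert names == [u"False", u"True"]
assert groups == [[u"a", u"c"], [u"B"]]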
@@ -1,59 +1,25 @@
-from libc.stdint cimport uint32_t
-from libc.stdint cimport uint64_t
-
-ctypedef int ClusterID
-ctypedef uint32_t StringHash
-ctypedef size_t LexID
-ctypedef char OrthFlags
-ctypedef char DistFlags
-ctypedef uint64_t TagFlags
-
-
-cdef enum OrthFlag:
-    IS_ALPHA
-    IS_DIGIT
-    IS_PUNCT
-    IS_SPACE
-    IS_LOWER
-    IS_UPPER
-    IS_TITLE
-    IS_ASCII
-
-
-cdef enum:
-    NORM
-    SHAPE
-    LAST3
-
-
-cdef class Word:
+from .typedefs cimport hash_t, utf8_t, flag_t, id_t
+
+
+DEF MAX_FLAG = 64
+
+
+cdef class Lexeme:
     # NB: the readonly keyword refers to _Python_ access. The attributes are
     # writeable from Cython.
-    cdef readonly StringHash key
-    cdef readonly char** utf8_strings
+    cdef readonly id_t id
     cdef readonly size_t length
     cdef readonly double prob
-    cdef readonly ClusterID cluster
-    cdef readonly TagFlags possible_tags
-    cdef readonly DistFlags dist_flags
-    cdef readonly OrthFlags orth_flags
-
-    cpdef StringHash get_view(self, size_t i) except 0
-
-
-cdef class CasedWord(Word):
-    cpdef bint can_tag(self, TagFlags flag) except *
-    cpdef bint check_dist_flag(self, DistFlags flag) except *
-    cpdef bint check_orth_flag(self, OrthFlags flag) except *
-
-    cpdef bint is_often_titled(self) except *
-    cpdef bint is_often_uppered(self) except *
-
-    cpdef bint is_alpha(self) except *
-    cpdef bint is_digit(self) except *
-    cpdef bint is_punct(self) except *
-    cpdef bint is_space(self) except *
-    cpdef bint is_lower(self) except *
-    cpdef bint is_upper(self) except *
-    cpdef bint is_title(self) except *
-    cpdef bint is_ascii(self) except *
+    cdef readonly size_t cluster
+
+    cdef readonly utf8_t* strings
+    cdef readonly size_t nr_strings
+
+    cdef readonly flag_t flags
+
+    cpdef bint check_flag(self, size_t flag_id) except *
+    cpdef int set_flag(self, size_t flag_id) except -1
+
+    cpdef unicode get_string(self, size_t i) except *
+    cpdef id_t get_id(self, size_t i) except 0
+    cpdef int add_strings(self, list strings) except -1
spacy/word.pyx | 394
@@ -4,40 +4,32 @@
 
 from libc.stdlib cimport calloc, free
 
-# Python-visible enum for POS tags
-PUNCT = 0
-CONJ = 1
-NUM = 2
-X = 3
-DET = 4
-ADP = 5
-ADJ = 6
-ADV = 7
-VERB = 8
-NOUN = 9
-PDT = 10
-POS = 11
-PRON = 12
-PRT = 13
-
-
-DEF OFT_UPPER = 1
-DEF OFT_TITLE = 2
-
-
-cdef class Word:
+from spacy cimport flags
+
+
+cdef class Lexeme:
     """A lexical type.
 
+    Clients should avoid instantiating Lexemes directly, and instead use get_lexeme
+    from a language module, e.g. spacy.en.get_lexeme . This allows us to use only
+    one Lexeme object per lexical type.
+
     Attributes:
-        string (bytes):
-            A utf8-encoded byte-string for the word.
-
-        lex (StringHash):
-            A hash of the word.
+        id (view_id_t):
+            A unique ID of the word's string.
+
+            Implemented as the memory-address of the string,
+            as we use Python's string interning to guarantee that only one copy
+            of each string is seen.
+
+        string (unicode):
+            The unicode string.
+
+            Implemented as a property; relatively expensive.
 
         length (size_t):
-            The (unicode) length of the word.
+            The number of unicode code-points in the string.
 
         prob (double):
             An estimate of the word's unigram log probability.
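The new docstring leans on CPython string interning: interning guarantees one canonical object per distinct string, so the object's address can serve as a stable unique ID and equality reduces to pointer comparison. A small illustration of that premise (sys.intern in Python 3; a builtin intern() in the Python 2 of this commit's era):

# Interned equal strings are the same object, so the address is a usable ID.
import sys

a = sys.intern("pine" + "apple")
b = sys.intern("pineapple")
assert a is b                       # one canonical object per distinct string
assert id(a) == id(b)               # identity can stand in for the string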
@@ -60,186 +52,194 @@ cdef class Word:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     """
-    def __cinit__(self, bytes string, list string_views, prob=0.0, cluster=0,
-                  orth_flags=0, dist_flags=0, possible_tags=0):
-        self.string = <char*>string
-        self.length = len(string)
-        self.views = <char**>calloc(len(string_views), sizeof(StringHash))
-        cdef unicode view
-        for i in range(len(string_views)):
-            view = string_views[i]
-            self.string_views[i] = hash(view)
+    def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
+                  cluster=0, orth_flags=0, dist_flags=0, possible_tags=0):
+        self.id = <id_t>&string
+        self.length = length
+        self.nr_strings = 0
+        self.add_views(views)
 
     def __dealloc__(self):
-        free(self.string_views)
+        free(self.views)
 
-    cpdef StringHash get_view(self, size_t i) except 0:
-        return self.string_views[i]
-
-    cpdef bint check_orth_flag(self, OrthFlags flag) except *:
-        """Access the value of one of the pre-computed boolean orthographic features.
-
-        Meanings depend on the language-specific orthographic features being loaded.
-        The suggested features for latin-alphabet languages are: TODO
-        """
-        return self.orth_flags & (1 << flag)
-
-    cpdef bint check_dist_flag(self, DistFlags flag) except *:
+    property string:
+        def __get__(self):
+            return self.strings[0].decode('utf8')
+
+    cpdef unicode get_view_string(self, size_t i) except *:
+        assert i < self.nr_strings
+        return self.strings[i].decode('utf8')
+
+    cpdef intptr_t get_view_id(self, size_t i) except 0:
+        assert i < self.nr_strings
+        return <string_id_t>&self.views[i]
+
+    cpdef int add_views(self, list views) except -1:
+        self.nr_views += len(strings)
+        self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
+        cdef unicode view
+        cdef bytes utf8_string
+        for i, view in enumerate(strings):
+            view = string_views[i]
+            utf8_string = view.encode('utf8')
+            # Intern strings, allowing pointer comparison
+            utf8_string = intern(utf8_string)
+            self.views[i] = utf8_string
+
+    cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.
 
         Meanings depend on the language-specific distributional features being loaded.
         The suggested features for latin-alphabet languages are: TODO
         """
-        return self.dist_flags & (1 << flag)
+        assert flag_id < flags.MAX_FLAG
+        return self.flags & (1 << flag_id)
 
-    cpdef bint can_tag(self, TagFlags flag) except *:
-        """Check whether the word often receives a particular tag in a large text
-        corpus. "Often" is chosen by heuristic.
-        """
-        return self.possible_tags & (1 << flag)
-
-
-cdef class CasedWord(Word):
-    def __cinit__(self, bytes string):
-        string_views = [get_normaized(string), get_word_shape(string), string[-3:]]
-        Word.__cinit__(self, string, string_views)
-
-    cpdef bint is_often_uppered(self) except *:
-        '''Check the OFT_UPPER distributional flag for the word.
-
-        The OFT_UPPER flag records whether a lower-cased version of the word
-        is found in all-upper case frequently in a large sample of text, where
-        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
-        POS tagging).
-
-        Case statistics are estimated from a large text corpus. Estimates are read
-        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-
-        >>> is_often_uppered(lookup(u'nato'))
-        True
-        >>> is_often_uppered(lookup(u'the'))
-        False
-        '''
-        return self.dist_flags & (1 << OFT_UPPER)
-
-    cpdef bint is_often_titled(self) except *:
-        '''Check the OFT_TITLE distributional flag for the word.
-
-        The OFT_TITLE flag records whether a lower-cased version of the word
-        is found title-cased (see string.istitle) frequently in a large sample of text,
-        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
-        POS tagging).
-
-        Case statistics are estimated from a large text corpus. Estimates are read
-        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-
-        >>> is_oft_upper(lookup(u'john'))
-        True
-        >>> is_oft_upper(lookup(u'Bill'))
-        False
-        '''
-        return self.dist_flags & (1 << OFT_TITLE)
-
-    cpdef bint is_alpha(self) except *:
-        """Check whether all characters in the word's string are alphabetic.
-
-        Should match the :py:func:`unicode.isalpha()` function.
-
-        >>> is_alpha(lookup(u'Hello'))
-        True
-        >>> is_alpha(lookup(u'العرب'))
-        True
-        >>> is_alpha(lookup(u'10'))
-        False
-        """
-        return self.orth_flags & 1 << IS_ALPHA
-
-    cpdef bint is_digit(self) except *:
-        """Check whether all characters in the word's string are numeric.
-
-        Should match the :py:func:`unicode.isdigit()` function.
-
-        >>> is_digit(lookup(u'10'))
-        True
-        >>> is_digit(lookup(u'๐'))
-        True
-        >>> is_digit(lookup(u'one'))
-        False
-        """
-        return self.orth_flags & 1 << IS_DIGIT
-
-    cpdef bint is_punct(self) except *:
-        """Check whether all characters belong to a punctuation unicode data category
-        for a Lexeme ID.
-
-        >>> is_punct(lookup(u'.'))
-        True
-        >>> is_punct(lookup(u'⁒'))
-        True
-        >>> is_punct(lookup(u' '))
-        False
-        """
-        return self.orth_flags & 1 << IS_PUNCT
-
-    cpdef bint is_space(self) except *:
-        """Give the result of unicode.isspace() for a Lexeme ID.
-
-        >>> is_space(lookup(u'\\t'))
-        True
-        >>> is_space(lookup(u'<unicode space>'))
-        True
-        >>> is_space(lookup(u'Hi\\n'))
-        False
-        """
-        return self.orth_flags & 1 << IS_SPACE
-
-    cpdef bint is_lower(self) except *:
-        """Give the result of unicode.islower() for a Lexeme ID.
-
-        >>> is_lower(lookup(u'hi'))
-        True
-        >>> is_lower(lookup(<unicode>))
-        True
-        >>> is_lower(lookup(u'10'))
-        False
-        """
-        return self.orth_flags & 1 << IS_LOWER
-
-    cpdef bint is_upper(self) except *:
-        """Give the result of unicode.isupper() for a Lexeme ID.
-
-        >>> is_upper(lookup(u'HI'))
-        True
-        >>> is_upper(lookup(u'H10'))
-        True
-        >>> is_upper(lookup(u'10'))
-        False
-        """
-        return self.orth_flags & 1 << IS_UPPER
-
-    cpdef bint is_title(self) except *:
-        """Give the result of unicode.istitle() for a Lexeme ID.
-
-        >>> is_title(lookup(u'Hi'))
-        True
-        >>> is_title(lookup(u'Hi1'))
-        True
-        >>> is_title(lookup(u'1'))
-        False
-        """
-        return self.orth_flags & 1 << IS_TITLE
-
-    cpdef bint is_ascii(self) except *:
-        """Give the result of checking whether all characters in the string are ascii.
-
-        >>> is_ascii(lookup(u'Hi'))
-        True
-        >>> is_ascii(lookup(u' '))
-        True
-        >>> is_title(lookup(u'<unicode>'))
-        False
-        """
-        return self.orth_flags & 1 << IS_ASCII
+    cpdef int set_flag(self, size_t flag_id) except -1:
+        assert flag_id < flags.MAX_FLAG
+        self.flags |= (1 << flag_id)
+
+
+#
+#cdef class CasedWord(Word):
+#    def __cinit__(self, bytes string, list views):
+#        Word.__cinit__(self, string, string_views)
+#
+#    cpdef bint is_often_uppered(self) except *:
+#        '''Check the OFT_UPPER distributional flag for the word.
+#
+#        The OFT_UPPER flag records whether a lower-cased version of the word
+#        is found in all-upper case frequently in a large sample of text, where
+#        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
+#        POS tagging).
+#
+#        Case statistics are estimated from a large text corpus. Estimates are read
+#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
+#
+#        >>> is_often_uppered(lookup(u'nato'))
+#        True
+#        >>> is_often_uppered(lookup(u'the'))
+#        False
+#        '''
+#        return self.dist_flags & (1 << OFT_UPPER)
+#
+#
+#    cpdef bint is_often_titled(self) except *:
+#        '''Check the OFT_TITLE distributional flag for the word.
+#
+#        The OFT_TITLE flag records whether a lower-cased version of the word
+#        is found title-cased (see string.istitle) frequently in a large sample of text,
+#        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
+#        POS tagging).
+#
+#        Case statistics are estimated from a large text corpus. Estimates are read
+#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
+#
+#        >>> is_oft_upper(lookup(u'john'))
+#        True
+#        >>> is_oft_upper(lookup(u'Bill'))
+#        False
+#        '''
+#        return self.dist_flags & (1 << OFT_TITLE)
+#
+#
+#    cpdef bint is_alpha(self) except *:
+#        """Check whether all characters in the word's string are alphabetic.
+#
+#        Should match the :py:func:`unicode.isalpha()` function.
+#
+#        >>> is_alpha(lookup(u'Hello'))
+#        True
+#        >>> is_alpha(lookup(u'العرب'))
+#        True
+#        >>> is_alpha(lookup(u'10'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_ALPHA
+#
+#    cpdef bint is_digit(self) except *:
+#        """Check whether all characters in the word's string are numeric.
+#
+#        Should match the :py:func:`unicode.isdigit()` function.
+#
+#        >>> is_digit(lookup(u'10'))
+#        True
+#        >>> is_digit(lookup(u'๐'))
+#        True
+#        >>> is_digit(lookup(u'one'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_DIGIT
+#
+#    cpdef bint is_punct(self) except *:
+#        """Check whether all characters belong to a punctuation unicode data category
+#        for a Lexeme ID.
+#
+#        >>> is_punct(lookup(u'.'))
+#        True
+#        >>> is_punct(lookup(u'⁒'))
+#        True
+#        >>> is_punct(lookup(u' '))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_PUNCT
+#
+#    cpdef bint is_space(self) except *:
+#        """Give the result of unicode.isspace() for a Lexeme ID.
+#
+#        >>> is_space(lookup(u'\\t'))
+#        True
+#        >>> is_space(lookup(u'<unicode space>'))
+#        True
+#        >>> is_space(lookup(u'Hi\\n'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_SPACE
+#
+#    cpdef bint is_lower(self) except *:
+#        """Give the result of unicode.islower() for a Lexeme ID.
+#
+#        >>> is_lower(lookup(u'hi'))
+#        True
+#        >>> is_lower(lookup(<unicode>))
+#        True
+#        >>> is_lower(lookup(u'10'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_LOWER
+#
+#    cpdef bint is_upper(self) except *:
+#        """Give the result of unicode.isupper() for a Lexeme ID.
+#
+#        >>> is_upper(lookup(u'HI'))
+#        True
+#        >>> is_upper(lookup(u'H10'))
+#        True
+#        >>> is_upper(lookup(u'10'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_UPPER
+#
+#    cpdef bint is_title(self) except *:
+#        """Give the result of unicode.istitle() for a Lexeme ID.
+#
+#        >>> is_title(lookup(u'Hi'))
+#        True
+#        >>> is_title(lookup(u'Hi1'))
+#        True
+#        >>> is_title(lookup(u'1'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_TITLE
+#
+#    cpdef bint is_ascii(self) except *:
+#        """Give the result of checking whether all characters in the string are ascii.
+#
+#        >>> is_ascii(lookup(u'Hi'))
+#        True
+#        >>> is_ascii(lookup(u' '))
+#        True
+#        >>> is_title(lookup(u'<unicode>'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_ASCII
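The cluster attribute in the docstring above appears to be a Brown cluster ID: a bit string in which similarity is shared prefix length, which is presumably why whole-ID comparisons ("scalable" landing near "pineapple") disappoint. An illustrative sketch with entirely made-up IDs; only the prefix relationship matters:

# Made-up Brown-style cluster IDs; similarity = length of shared bit prefix.
def shared_prefix(a, b):
    n = 0
    for ca, cb in zip(a, b):
        if ca != cb:
            break
        n += 1
    return n

apple     = "111101000"
pineapple = "111101001"              # hypothetical: near "apple" in the hierarchy
scalable  = "001101001"              # hypothetical: unrelated word, similar tail

assert shared_prefix(apple, pineapple) == 8   # long shared prefix: similar
assert shared_prefix(apple, scalable) == 0    # no shared prefix: dissimilar
assert pineapple[-6:] == scalable[-6:]        # matching tails mean nothing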