mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* More refactoring
This commit is contained in:
parent
88095666dc
commit
68bae2fec6
10
setup.py
10
setup.py
|
@ -45,13 +45,13 @@ else:
|
|||
|
||||
|
||||
exts = [
|
||||
#Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.en", ["spacy/en.pyx"], language="c++",
|
||||
include_dirs=includes),
|
||||
Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.lang", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.word", ["spacy/word.pyx"], language="c++",
|
||||
include_dirs=includes),
|
||||
Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.en", ["spacy/en.pyx"], language="c++",
|
||||
include_dirs=includes),
|
||||
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -1,25 +0,0 @@
|
|||
from libc.stdint cimport uint64_t
|
||||
|
||||
from chartree cimport CharTree
|
||||
|
||||
|
||||
cdef class FixedTable:
    # Number of buckets; also the modulus used to map a key to its bucket.
    cdef size_t size
    # Parallel C arrays indexed by bucket: the stored key and its value.
    cdef uint64_t* keys
    cdef size_t* values

    # C-level accessors; all three are callable without holding the GIL.
    cdef size_t insert(self, uint64_t key, size_t value) nogil
    cdef size_t get(self, uint64_t key) nogil
    cdef int erase(self, uint64_t key) nogil
|
||||
|
||||
|
||||
cdef class WordTree:
    # Byte-length threshold: strings at least this long go to ``_dict``.
    cdef size_t max_length
    # Stored by the constructor.
    cdef size_t default
    # One CharTree per UTF-8 byte length below ``max_length``.
    cdef CharTree* _trees
    # Fallback mapping (utf8 bytes -> value) for the longer strings.
    cdef dict _dict

    cdef size_t get(self, unicode string) except *
    cdef int set(self, unicode string, size_t value) except *
    cdef bint contains(self, unicode string) except *
|
||||
|
|
@ -1,98 +0,0 @@
|
|||
from libc.stdlib cimport calloc, free
|
||||
import cython
|
||||
|
||||
cimport chartree
|
||||
|
||||
|
||||
cdef class FixedTable:
    """Fixed-size, direct-mapped table from uint64 keys to size_t values.

    Every key maps to exactly one bucket (``key % size``).  There is no
    collision handling: inserting into an occupied bucket overwrites the
    previous occupant.  Both arrays start zero-filled and ``get`` returns 0
    for a key that is not stored, so a value of 0 cannot be told apart from
    "missing".
    """
    def __cinit__(self, const size_t size):
        # With zero buckets, ``key % size`` in _find would divide by zero.
        if size == 0:
            raise ValueError("FixedTable size must be greater than zero")
        self.size = size
        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
        self.values = <size_t*>calloc(self.size, sizeof(size_t))
        # __dealloc__ still runs after a failed __cinit__, and free(NULL) is
        # a no-op, so whichever array was allocated is released.
        if self.keys == NULL or self.values == NULL:
            raise MemoryError()

    def __dealloc__(self):
        free(self.keys)
        free(self.values)

    def __getitem__(self, uint64_t key):
        return self.get(key)

    def __setitem__(self, uint64_t key, size_t value):
        self.insert(key, value)

    def pop(self, uint64_t key):
        """Clear the bucket that ``key`` maps to.  Returns None."""
        self.erase(key)

    def bucket(self, uint64_t key):
        """Return the index of the bucket that ``key`` maps to."""
        return _find(key, self.size)

    cdef size_t insert(self, uint64_t key, size_t value) nogil:
        """Store ``value`` under ``key``, overwriting the bucket's contents.

        Returns:
            The value previously held in the bucket, or 0 when that value
            was equal to ``value``.
        """
        cdef size_t bucket = _find(key, self.size)
        cdef size_t clobbered
        if self.values[bucket] == value:
            clobbered = 0
        else:
            clobbered = self.values[bucket]
        self.keys[bucket] = key
        self.values[bucket] = value
        return clobbered

    cdef size_t get(self, uint64_t key) nogil:
        """Return the value stored for ``key``, or 0 if its bucket holds a
        different key."""
        cdef size_t bucket = _find(key, self.size)
        if self.keys[bucket] == key:
            return self.values[bucket]
        else:
            return 0

    cdef int erase(self, uint64_t key) nogil:
        """Zero the bucket that ``key`` maps to.

        The bucket is cleared without checking which key occupies it.
        """
        cdef size_t bucket = _find(key, self.size)
        self.keys[bucket] = 0
        self.values[bucket] = 0
        return 0
|
||||
|
||||
|
||||
@cython.cdivision
cdef inline size_t _find(uint64_t key, size_t size) nogil:
    # Bucket index for ``key`` in a table of ``size`` buckets.
    cdef size_t bucket = key % size
    return bucket
|
||||
|
||||
|
||||
cdef class WordTree:
    """Map unicode strings to size_t values, bucketed by UTF-8 byte length.

    Strings whose UTF-8 encoding is shorter than ``max_length`` bytes are
    stored in the CharTree for that exact length; longer strings fall back
    to a Python dict keyed by the encoded bytes.
    """
    def __cinit__(self, size_t default, size_t max_length):
        cdef size_t i
        self.max_length = max_length
        # NOTE(review): ``default`` is stored but not read by any method in
        # this class; misses in the dict path return 0.
        self.default = default
        self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
        # calloc may legitimately return NULL for a zero-sized request.
        if self._trees == NULL and max_length != 0:
            raise MemoryError()
        for i in range(self.max_length):
            chartree.init(&self._trees[i], i)
        self._dict = {}

    def __dealloc__(self):
        # Release the array allocated in __cinit__ (free(NULL) is a no-op).
        # NOTE(review): if chartree.init allocates per-tree storage, it needs
        # its own teardown here -- confirm against the chartree module.
        free(self._trees)

    cdef size_t get(self, unicode ustring) except *:
        """Return the value stored for ``ustring``.

        For strings handled by the dict fallback, 0 is returned on a miss.
        """
        cdef bytes bstring = ustring.encode('utf8')
        cdef size_t length = len(bstring)
        if length >= self.max_length:
            return self._dict.get(bstring, 0)
        else:
            return chartree.getitem(&self._trees[length], bstring)

    cdef int set(self, unicode ustring, size_t value) except *:
        """Store ``value`` for ``ustring``."""
        cdef bytes bstring = ustring.encode('utf8')
        cdef size_t length = len(bstring)
        if length >= self.max_length:
            self._dict[bstring] = value
        else:
            chartree.setitem(&self._trees[length], bstring, value)

    cdef bint contains(self, unicode ustring) except *:
        """Return whether ``ustring`` has an entry."""
        cdef bytes bstring = ustring.encode('utf8')
        cdef size_t length = len(bstring)
        if length >= self.max_length:
            return bstring in self._dict
        else:
            return chartree.contains(&self._trees[length], bstring)

    def __getitem__(self, unicode key):
        return self.get(key)

    def __setitem__(self, unicode key, size_t value):
        self.set(key, value)

    def __contains__(self, unicode key):
        return self.contains(key)
|
36
spacy/en.pxd
36
spacy/en.pxd
|
@ -1,15 +1,38 @@
|
|||
from libcpp.vector cimport vector
|
||||
|
||||
from spacy.spacy cimport StringHash
|
||||
|
||||
from spacy.spacy cimport Language
|
||||
from spacy.word cimport LatinWord
|
||||
from spacy.word cimport Lexeme
|
||||
cimport cython
|
||||
|
||||
|
||||
cpdef size_t ALPHA
|
||||
cpdef size_t DIGIT
|
||||
cpdef size_t PUNCT
|
||||
cpdef size_t SPACE
|
||||
cpdef size_t LOWER
|
||||
cpdef size_t UPPER
|
||||
cpdef size_t TITLE
|
||||
cpdef size_t ASCII
|
||||
|
||||
cpdef size_t OFT_LOWER
|
||||
cpdef size_t OFT_TITLE
|
||||
cpdef size_t OFT_UPPER
|
||||
|
||||
cpdef size_t PUNCT
|
||||
cpdef size_t CONJ
|
||||
cpdef size_t NUM
|
||||
cpdef size_t N
|
||||
cpdef size_t DET
|
||||
cpdef size_t ADP
|
||||
cpdef size_t ADJ
|
||||
cpdef size_t ADV
|
||||
cpdef size_t VERB
|
||||
cpdef size_t NOUN
|
||||
cpdef size_t PDT
|
||||
cpdef size_t POS
|
||||
cpdef size_t PRON
|
||||
cpdef size_t PRT
|
||||
|
||||
cdef class English(spacy.Language):
|
||||
cdef int find_split(self, unicode word)
|
||||
cdef LatinWord new_lexeme(self, unicode string)
|
||||
|
||||
|
||||
cdef English EN
|
||||
|
@ -17,4 +40,3 @@ cdef English EN
|
|||
|
||||
cpdef Word lookup(unicode word)
|
||||
cpdef list tokenize(unicode string)
|
||||
cpdef unicode unhash(StringHash hash_value)
|
||||
|
|
102
spacy/en.pyx
102
spacy/en.pyx
|
@ -43,9 +43,85 @@ from libc.stdint cimport uint64_t
|
|||
cimport spacy
|
||||
|
||||
|
||||
# Python-readable flag constants --- can't read an enum from Python

# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0

# Bit positions consumed by set_orth_flags.
ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1

OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1

# NOTE(review): PUNCT is assigned a second time here, so the module-level
# PUNCT ends up with this later value rather than the one assigned next to
# DIGIT/SPACE above.  Both assignments are kept so the numbering of every
# other constant is unchanged; one of the two names probably needs renaming.
PUNCT = __i; __i += 1
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1


# These are for the string views
__i = 0
SIC = __i; __i += 1
CANON_CASED = __i; __i += 1
NON_SPARSE = __i; __i += 1
SHAPE = __i; __i += 1
NR_STRING_VIEWS = __i
|
||||
|
||||
|
||||
def get_string_views(unicode string, lexeme):
    """Build the list of string views for ``string``.

    The result has NR_STRING_VIEWS entries and is indexed by the view
    constants SIC, CANON_CASED, SHAPE and NON_SPARSE.

    Args:
        string (unicode): The word's string; stored unchanged as the SIC view.
        lexeme: Passed through to canonicalize_case and get_non_sparse.

    Returns:
        views (list): One string per view.
    """
    views = [''] * NR_STRING_VIEWS
    views[SIC] = string
    canon_cased = canonicalize_case(string, lexeme)
    views[CANON_CASED] = canon_cased
    shape = get_string_shape(string)
    views[SHAPE] = shape
    # The non-sparse view is derived from the two views computed above.
    views[NON_SPARSE] = get_non_sparse(string, canon_cased, shape, lexeme)
    return views
|
||||
|
||||
|
||||
def set_orth_flags(unicode string, flags_t flags):
    """Switch on the orthographic bits of ``flags`` that hold for ``string``.

    Each predicate is called on ``string``; when it returns true, the bit at
    the paired position is OR-ed into ``flags``.  Bits are only ever set,
    never cleared.

    Args:
        string (unicode): The string to test.
        flags (flags_t): The starting bit-field.

    Returns:
        The updated bit-field.
    """
    # NOTE(review): TITLE and ASCII have bit positions but no predicate is
    # registered for them here -- confirm whether that is intentional.
    setters = [
        (ALPHA, is_alpha),
        (DIGIT, is_digit),
        (PUNCT, is_punct),
        (SPACE, is_space),
        (LOWER, is_lower),
        (UPPER, is_upper),
    ]

    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags
|
||||
|
||||
|
||||
|
||||
|
||||
cdef class English(spacy.Language):
|
||||
cdef LatinWord new_lexeme(self, unicode string):
|
||||
return LatinWord(string)
|
||||
cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None,
|
||||
tag_freqs=None):
|
||||
return Lexeme(s, length, views, prob=prob, cluster=cluster,
|
||||
flags=self.get_flags(string))
|
||||
|
||||
cdef int find_split(self, unicode word):
|
||||
cdef size_t length = len(word)
|
||||
|
@ -101,7 +177,7 @@ cpdef list tokenize(unicode string):
|
|||
return EN.tokenize(string)
|
||||
|
||||
|
||||
cpdef Word lookup(unicode string):
|
||||
cpdef Lexeme lookup(unicode string):
|
||||
"""Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
|
||||
|
||||
Properties of the Lexeme are accessed by passing LexID to the accessor methods.
|
||||
|
@ -116,23 +192,6 @@ cpdef Word lookup(unicode string):
|
|||
return EN.lookup(string)
|
||||
|
||||
|
||||
cpdef unicode unhash(StringHash hash_value):
    """Look up the string for a hash value via ``EN.unhash``.  Mostly used
    for testing.

    Computing with strings is slower than the intended ID-based usage, so
    prefer IDs where possible.  No control is taken for hash collisions.

    Args:
        hash_value (StringHash): The hash of a string, returned by Python's
            hash() function.

    Returns:
        string (unicode): A unicode string that hashes to the hash_value.
    """
    cdef unicode string = EN.unhash(hash_value)
    return string
|
||||
|
||||
|
||||
def add_string_views(view_funcs):
|
||||
"""Add a string view to existing and previous lexical entries.
|
||||
|
||||
|
@ -150,16 +209,19 @@ def load_clusters(location):
|
|||
"""
|
||||
pass
|
||||
|
||||
|
||||
def load_unigram_probs(location):
    """Load unigram probabilities.

    Not implemented yet: ``location`` is ignored and None is returned.
    """
    return None
|
||||
|
||||
|
||||
def load_case_stats(location):
    """Load case stats.

    Not implemented yet: ``location`` is ignored and None is returned.
    """
    return None
|
||||
|
||||
|
||||
def load_tag_stats(location):
|
||||
"""Load tag statistics.
|
||||
"""
|
||||
|
|
|
@ -1,16 +1,12 @@
|
|||
from libc.stdint cimport uint32_t
|
||||
from libc.stdint cimport uint64_t
|
||||
from spacy.word cimport Word
|
||||
|
||||
ctypedef uint32_t StringHash
|
||||
|
||||
from spacy.word cimport Lexeme
|
||||
|
||||
|
||||
cdef class Language:
|
||||
cdef object name
|
||||
cdef dict chunks
|
||||
cdef dict vocab
|
||||
cdef dict bacov
|
||||
cdef dict blobs
|
||||
cdef dict lexicon
|
||||
|
||||
cpdef list tokenize(self, unicode text)
|
||||
|
||||
|
@ -20,8 +16,5 @@ cdef class Language:
|
|||
cdef list new_chunk(self, unicode string, list substrings)
|
||||
cdef Word new_lexeme(self, unicode lex)
|
||||
|
||||
cpdef unicode unhash(self, StringHash hashed)
|
||||
|
||||
cpdef list find_substrings(self, unicode chunk)
|
||||
cdef int find_split(self, unicode word)
|
||||
cdef int set_orth(self, unicode string, Word word)
|
|
@ -15,16 +15,13 @@ from libc.stdlib cimport calloc, free
|
|||
from . import util
|
||||
from os import path
|
||||
|
||||
TAGS = {}
|
||||
DIST_FLAGS = {}
|
||||
|
||||
cdef class Language:
|
||||
view_funcs = []
|
||||
def __cinit__(self, name):
|
||||
self.name = name
|
||||
self.bacov = {}
|
||||
self.chunks = {}
|
||||
self.vocab = {}
|
||||
self.blobs = {}
|
||||
self.lexicon = {}
|
||||
self.load_tokenization(util.read_tokenization(name))
|
||||
self.load_dist_info(util.read_dist_info(name))
|
||||
|
||||
|
@ -37,26 +34,26 @@ cdef class Language:
|
|||
string (unicode): The string to split.
|
||||
|
||||
Returns:
|
||||
tokens (Tokens): A Tokens object.
|
||||
tokens (list): A list of Lexeme objects.
|
||||
"""
|
||||
cdef list chunk
|
||||
cdef list blob
|
||||
cdef list tokens = []
|
||||
cdef size_t length = len(string)
|
||||
cdef size_t start = 0
|
||||
cdef size_t i = 0
|
||||
for c in string:
|
||||
if _is_whitespace(c):
|
||||
if c == ' ':
|
||||
if start < i:
|
||||
chunk = self.lookup_chunk(string[start:i])
|
||||
tokens.extend(chunk)
|
||||
blob = self.lookup_blob(string[start:i])
|
||||
tokens.extend(blob)
|
||||
start = i + 1
|
||||
i += 1
|
||||
if start < i:
|
||||
chunk = self.lookup_chunk(string[start:])
|
||||
chunk = self.lookup_blob(string[start:])
|
||||
tokens.extend(chunk)
|
||||
return tokens
|
||||
|
||||
cdef Word lookup(self, unicode string):
|
||||
cdef Lexeme lookup(self, unicode string):
|
||||
assert len(string) != 0
|
||||
cdef Word word
|
||||
if string in self.vocab:
|
||||
|
@ -65,28 +62,26 @@ cdef class Language:
|
|||
word = self.new_lexeme(string)
|
||||
return word
|
||||
|
||||
cdef list lookup_chunk(self, unicode string):
|
||||
cdef list lookup_blob(self, unicode string):
|
||||
cdef list chunk
|
||||
cdef size_t chunk_id
|
||||
if string in self.chunks:
|
||||
chunk = self.chunks[string]
|
||||
cdef size_t blob_id
|
||||
if string in self.blobs:
|
||||
blob = self.blobs[string]
|
||||
else:
|
||||
chunk = self.new_chunk(string, self.find_substrings(string))
|
||||
blob = self.new_blob(string, self.find_substrings(string))
|
||||
return chunk
|
||||
|
||||
cdef list new_chunk(self, unicode string, list substrings):
|
||||
chunk = []
|
||||
cdef list new_blob(self, unicode string, list substrings):
|
||||
blob = []
|
||||
for i, substring in enumerate(substrings):
|
||||
chunk.append(self.lookup(substring))
|
||||
self.chunks[string] = chunk
|
||||
return chunk
|
||||
blob.append(self.lookup(substring))
|
||||
self.blobs[string] = chunk
|
||||
return blob
|
||||
|
||||
cdef Word new_lexeme(self, unicode string):
|
||||
string_views = [view_func(string) for view_func in self.view_funcs]
|
||||
word = Word(string.encode('utf8'), string_views)
|
||||
self.bacov[word.lex] = string
|
||||
self.vocab[string] = word
|
||||
return word
|
||||
# TODO
|
||||
#lexeme = Lexeme(string.encode('utf8'), string_views)
|
||||
#return lexeme
|
||||
|
||||
"""
|
||||
def add_view_funcs(self, list view_funcs):
|
||||
|
@ -112,11 +107,7 @@ cdef class Language:
|
|||
self.bacov[hashed] = view
|
||||
"""
|
||||
|
||||
cpdef unicode unhash(self, StringHash hash_value):
|
||||
'''Fetch a string from the reverse index, given its hash value.'''
|
||||
return self.bacov[hash_value]
|
||||
|
||||
cpdef list find_substrings(self, unicode chunk):
|
||||
cpdef list find_substrings(self, unicode blob):
|
||||
"""Find how to split a chunk into substrings.
|
||||
|
||||
This method calls find_split repeatedly. Most languages will want to
|
||||
|
@ -129,21 +120,18 @@ cdef class Language:
|
|||
substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
|
||||
"""
|
||||
substrings = []
|
||||
while chunk:
|
||||
split = self.find_split(chunk)
|
||||
while blob:
|
||||
split = self.find_split(blob)
|
||||
if split == 0:
|
||||
substrings.append(chunk)
|
||||
substrings.append(blob)
|
||||
break
|
||||
substrings.append(chunk[:split])
|
||||
chunk = chunk[split:]
|
||||
substrings.append(blob[:split])
|
||||
blob = blob[split:]
|
||||
return substrings
|
||||
|
||||
cdef int find_split(self, unicode word):
|
||||
return len(word)
|
||||
|
||||
cdef int set_orth(self, unicode string, Word word):
|
||||
pass
|
||||
|
||||
def load_tokenization(self, token_rules):
|
||||
'''Load special-case tokenization rules.
|
||||
|
||||
|
@ -178,22 +166,3 @@ cdef class Language:
|
|||
w.dist_flags |= DIST_FLAGS[flag]
|
||||
for tag in word_dist.tagdict:
|
||||
w.possible_tags |= TAGS[tag]
|
||||
|
||||
|
||||
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
    # Only space, newline and tab count as whitespace here.
    return c == ' ' or c == '\n' or c == '\t'
|
||||
|
||||
|
||||
#cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
|
||||
# cdef size_t i = 0
|
||||
# while chunk[i] != NULL:
|
||||
# tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
|
||||
# tokens.length += 1
|
||||
# i += 1
|
|
@ -1,32 +0,0 @@
|
|||
# Bit positions within a lexeme's orth_flags field; the is_* accessors test
# ``orth_flags & (1 << FLAG)``.
cdef enum OrthFlag:
    IS_ALPHA
    IS_DIGIT
    IS_PUNCT
    IS_SPACE
    IS_LOWER
    IS_UPPER
    IS_TITLE
    IS_ASCII
|
||||
|
||||
|
||||
# Indices into a lexeme's string_views array (see norm_of, shape_of, last3_of).
cdef enum:
    NORM
    SHAPE
    LAST3
|
||||
|
||||
from spacy.lexeme cimport LexID
|
||||
from spacy.lexeme cimport StringHash
|
||||
|
||||
cpdef bint is_alpha(LexID lex_id) except *
|
||||
cpdef bint is_digit(LexID lex_id) except *
|
||||
cpdef bint is_punct(LexID lex_id) except *
|
||||
cpdef bint is_space(LexID lex_id) except *
|
||||
cpdef bint is_lower(LexID lex_id) except *
|
||||
cpdef bint is_upper(LexID lex_id) except *
|
||||
cpdef bint is_title(LexID lex_id) except *
|
||||
cpdef bint is_ascii(LexID lex_id) except *
|
||||
|
||||
|
||||
cpdef StringHash norm_of(LexID lex_id) except 0
|
||||
cpdef StringHash shape_of(LexID lex_id) except 0
|
||||
cpdef StringHash last3_of(LexID lex_id) except 0
|
|
@ -1,211 +0,0 @@
|
|||
# cython: embedsignature=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lexeme cimport Lexeme
|
||||
|
||||
def get_normalized(unicode word):
    """Return ``word`` itself if it is lowercase alphabetic, else its shape.

    Args:
        word (unicode): The string to normalize.

    Returns:
        normalized (unicode): ``word`` unchanged when both ``word.isalpha()``
            and ``word.islower()`` hold; otherwise ``get_word_shape(word)``.
    """
    if not (word.isalpha() and word.islower()):
        return get_word_shape(word)
    return word
|
||||
|
||||
|
||||
def get_word_shape(unicode word):
    """Return the orthographic shape of ``word``.

    Each character is mapped to a shape character: "X" for an uppercase
    letter, "x" for any other alphabetic character, "d" for a digit, and the
    character itself otherwise.  A run of identical shape characters is
    truncated after its third occurrence.

    Args:
        word (unicode): The string to describe; must be non-empty.

    Returns:
        shape (unicode): The shape string.

    Raises:
        AssertionError: If the resulting shape is empty (i.e. ``word`` is
            empty).
    """
    shape_chars = []
    last = ""
    seq = 0
    for c in word:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
            last = shape_char
        # seq counts repeats beyond the first, so at most three are kept.
        if seq < 3:
            shape_chars.append(shape_char)
    shape = "".join(shape_chars)
    assert shape
    return shape
|
||||
|
||||
|
||||
cpdef unicode get_last3(unicode string):
    """Return the last three characters of ``string`` (all of it if shorter)."""
    cdef unicode suffix = string[-3:]
    return suffix
|
||||
|
||||
|
||||
cpdef bint is_alpha(LexID lex_id) except *:
    """Report whether the IS_ALPHA bit is set in the lexeme's orth_flags.

    The flag is meant to say that all characters in the word's string are
    alphabetic, and should match the :py:func:`unicode.isalpha()` function.

    >>> is_alpha(lookup(u'Hello'))
    True
    >>> is_alpha(lookup(u'العرب'))
    True
    >>> is_alpha(lookup(u'10'))
    False
    """
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.orth_flags & (1 << IS_ALPHA)
|
||||
|
||||
|
||||
cpdef bint is_digit(LexID lex_id) except *:
    """Report whether the IS_DIGIT bit is set in the lexeme's orth_flags.

    The flag is meant to say that all characters in the word's string are
    numeric, and should match the :py:func:`unicode.isdigit()` function.

    >>> is_digit(lookup(u'10'))
    True
    >>> is_digit(lookup(u'๐'))
    True
    >>> is_digit(lookup(u'one'))
    False
    """
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.orth_flags & (1 << IS_DIGIT)
|
||||
|
||||
|
||||
cpdef bint is_punct(LexID lex_id) except *:
    """Report whether the IS_PUNCT bit is set in the lexeme's orth_flags.

    The flag is meant to say that all characters of the word belong to a
    punctuation unicode data category.

    >>> is_punct(lookup(u'.'))
    True
    >>> is_punct(lookup(u'⁒'))
    True
    >>> is_punct(lookup(u' '))
    False
    """
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.orth_flags & (1 << IS_PUNCT)
|
||||
|
||||
|
||||
cpdef bint is_space(LexID lex_id) except *:
    """Report whether the IS_SPACE bit is set in the lexeme's orth_flags.

    The flag is meant to carry the result of unicode.isspace() for the word.

    >>> is_space(lookup(u'\\t'))
    True
    >>> is_space(lookup(u'<unicode space>'))
    True
    >>> is_space(lookup(u'Hi\\n'))
    False
    """
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.orth_flags & (1 << IS_SPACE)
|
||||
|
||||
|
||||
cpdef bint is_lower(LexID lex_id) except *:
    """Report whether the IS_LOWER bit is set in the lexeme's orth_flags.

    The flag is meant to carry the result of unicode.islower() for the word.

    >>> is_lower(lookup(u'hi'))
    True
    >>> is_lower(lookup(<unicode>))
    True
    >>> is_lower(lookup(u'10'))
    False
    """
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.orth_flags & (1 << IS_LOWER)
|
||||
|
||||
|
||||
cpdef bint is_upper(LexID lex_id) except *:
    """Report whether the IS_UPPER bit is set in the lexeme's orth_flags.

    The flag is meant to carry the result of unicode.isupper() for the word.

    >>> is_upper(lookup(u'HI'))
    True
    >>> is_upper(lookup(u'H10'))
    True
    >>> is_upper(lookup(u'10'))
    False
    """
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.orth_flags & (1 << IS_UPPER)
|
||||
|
||||
|
||||
cpdef bint is_title(LexID lex_id) except *:
    """Report whether the IS_TITLE bit is set in the lexeme's orth_flags.

    The flag is meant to carry the result of unicode.istitle() for the word.

    >>> is_title(lookup(u'Hi'))
    True
    >>> is_title(lookup(u'Hi1'))
    True
    >>> is_title(lookup(u'1'))
    False
    """
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.orth_flags & (1 << IS_TITLE)
|
||||
|
||||
|
||||
cpdef bint is_ascii(LexID lex_id) except *:
    """Report whether the IS_ASCII bit is set in the lexeme's orth_flags.

    The flag is meant to say that all characters in the string are ascii.

    >>> is_ascii(lookup(u'Hi'))
    True
    >>> is_ascii(lookup(u' '))
    True
    >>> is_ascii(lookup(u'<unicode>'))
    False
    """
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.orth_flags & (1 << IS_ASCII)
|
||||
|
||||
|
||||
cpdef StringHash norm_of(LexID lex_id) except 0:
    """Return the hash of a "normalized" version of the string.

    Reads the NORM entry of the lexeme's string_views.  Normalized strings
    are intended to be less sparse, while still capturing important lexical
    information.

    Examples (carried over from the original docstring, not re-verified):

    >>> unhash(norm_of(lookup(u'Hi')))
    u'hi'
    >>> unhash(norm_of(lookup(u'255667')))
    u'shape=dddd'
    >>> unhash(norm_of(lookup(u'...')))
    u'...'
    """
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.string_views[NORM]
|
||||
|
||||
|
||||
cpdef StringHash shape_of(LexID lex_id) except 0:
    """Return the hash of a string describing the word's "orthographic shape".

    Reads the SHAPE entry of the lexeme's string_views.  Word shape features
    have been found useful for NER and POS tagging, e.g. Manning (2011).

    Examples (carried over from the original docstring, not re-verified):

    >>> unhash(shape_of(lookup(u'Hi')))
    u'Xx'
    >>> unhash(shape_of(lookup(u'255667')))
    u'dddd'
    >>> unhash(shape_of(lookup(u'...')))
    u'...'
    """
    return (<Lexeme*>lex_id).string_views[SHAPE]
|
||||
|
||||
|
||||
cpdef StringHash last3_of(LexID lex_id) except 0:
    '''Return the hash of string[-3:], i.e. the last three characters of the word.

    Reads the LAST3 entry of the lexeme's string_views.

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.string_views[LAST3]
|
|
@ -1,7 +0,0 @@
|
|||
cpdef bytes to_bytes(unicode string)
|
||||
|
||||
cpdef unicode from_bytes(bytes string)
|
||||
|
||||
cpdef unicode substr(unicode string, int start, int end, size_t length)
|
||||
|
||||
cdef bint is_whitespace(Py_UNICODE c)
|
|
@ -1,35 +0,0 @@
|
|||
# cython: profile=True
|
||||
|
||||
cpdef bytes to_bytes(unicode string):
    """Encode ``string`` as UTF-8 bytes."""
    cdef bytes encoded = string.encode('utf8')
    return encoded
|
||||
|
||||
|
||||
cpdef unicode from_bytes(bytes string):
    """Decode UTF-8 ``string`` into a unicode object."""
    cdef unicode decoded = string.decode('utf8')
    return decoded
|
||||
|
||||
|
||||
cpdef unicode substr(unicode string, int start, int end, size_t length):
    """Return ``string[start:end]`` after normalizing out-of-range bounds.

    An ``end`` at or past ``length`` is treated as "to the end of the
    string"; a ``start`` at or past ``length`` is reset to 0.  When the
    normalized bounds cover the whole string, ``string`` itself is returned.

    NOTE(review): ``start``/``end`` are signed ints compared against an
    unsigned ``length`` -- confirm how negative arguments behave.
    """
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    # Whole-string request: hand back the original object.
    if start <= 0 and end < 0:
        return string
    if start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]
|
||||
|
||||
|
||||
cdef bint is_whitespace(Py_UNICODE c):
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    # For now only space, newline and tab are recognized.
    return c == u' ' or c == u'\n' or c == u'\t'
|
|
@ -1,18 +0,0 @@
|
|||
from libcpp.vector cimport vector
|
||||
from spacy.lexeme cimport LexID
|
||||
from spacy.lexeme cimport Lexeme
|
||||
|
||||
from cython.operator cimport dereference as deref
|
||||
from spacy.spacy cimport Language
|
||||
|
||||
|
||||
cdef class Tokens:
    # Language object the tokens came from; used by group_by to unhash keys.
    cdef Language lang
    # Heap-allocated vector of lexeme IDs (created in __cinit__, deleted in
    # __dealloc__).
    cdef vector[LexID]* vctr
    # Count of appended tokens; reported by __len__.
    cdef size_t length

    cpdef int append(self, LexID token)
    cpdef int extend(self, Tokens other) except -1

    cpdef object group_by(self, size_t attr)
    cpdef dict count_by(self, size_t attr)
|
|
@ -1,92 +0,0 @@
|
|||
from cython.operator cimport dereference as deref
|
||||
from cython.operator cimport preincrement as inc
|
||||
|
||||
|
||||
from spacy.lexeme cimport Lexeme
|
||||
from spacy.spacy cimport StringHash
|
||||
|
||||
|
||||
cdef class Tokens:
    """A sequence of lexeme IDs backed by a C++ vector."""
    def __cinit__(self, Language lang):
        self.lang = lang
        self.vctr = new vector[LexID]()
        self.length = 0

    def __dealloc__(self):
        del self.vctr

    def __iter__(self):
        cdef vector[LexID].iterator it = self.vctr[0].begin()
        while it != self.vctr[0].end():
            yield deref(it)
            inc(it)

    def __getitem__(self, size_t idx):
        return self.vctr[0].at(idx)

    def __len__(self):
        return self.length

    cpdef int append(self, LexID token):
        """Add one lexeme ID to the end of the sequence."""
        self.vctr[0].push_back(token)
        self.length += 1

    cpdef int extend(self, Tokens other) except -1:
        """Append every lexeme ID of ``other``, in order."""
        cdef LexID el
        for el in other:
            self.append(el)

    cpdef object group_by(self, size_t view_idx):
        '''Group tokens that share a key into Tokens instances.

        The key is the lexeme's ``lex`` hash when ``view_idx`` is 0, and
        ``string_views[view_idx - 1]`` otherwise.  Returns a tuple of three
        lists:

        (string names, hashes, tokens)

        The lists are aligned, so the ith entry in string names is the string
        that the ith entry in hashes unhashes to, which the ith Tokens
        instance is grouped by.  Groups appear in order of first occurrence.

        You can then use count_by or group_by on the Tokens
        for further processing. Calling group_by and then asking the length
        of the Tokens objects is equivalent to count_by, but somewhat slower.
        '''
        # Implementation here is working around some of the constraints in
        # Cython about what type of thing can go in what type of container.
        # Long story short, it's pretty hard to get a Python object like
        # Tokens into a vector or array. If we really need this to run faster,
        # we can be tricky and get the Python list access out of the loop. What
        # we'd do is store pointers to the underlying vectors.
        # So far, speed isn't mattering here.
        cdef dict indices = {}
        cdef list groups = []
        cdef list names = []
        cdef list hashes = []

        cdef StringHash key
        cdef LexID t
        for t in self.vctr[0]:
            if view_idx == 0:
                key = (<Lexeme*>t).lex
            else:
                key = (<Lexeme*>t).string_views[view_idx - 1]
            if key not in indices:
                # First time this key is seen: open a new, aligned group.
                indices[key] = len(groups)
                groups.append(Tokens(self.lang))
                names.append(self.lang.unhash(key))
                hashes.append(key)
            groups[indices[key]].append(t)
        return names, hashes, groups

    cpdef dict count_by(self, size_t attr):
        """Count tokens per key.

        The attribute lookup is not wired up: ``attr`` is ignored and every
        token is counted under key 0.
        """
        counts = {}
        cdef LexID t
        cdef StringHash key
        for t in self.vctr[0]:
            #key = attr_of(t, attr)
            key = 0
            counts[key] = counts.get(key, 0) + 1
        return counts
|
|
@ -1,59 +1,25 @@
|
|||
from libc.stdint cimport uint32_t
|
||||
from libc.stdint cimport uint64_t
|
||||
|
||||
ctypedef int ClusterID
|
||||
ctypedef uint32_t StringHash
|
||||
ctypedef size_t LexID
|
||||
ctypedef char OrthFlags
|
||||
ctypedef char DistFlags
|
||||
ctypedef uint64_t TagFlags
|
||||
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
|
||||
|
||||
|
||||
cdef enum OrthFlag:
|
||||
IS_ALPHA
|
||||
IS_DIGIT
|
||||
IS_PUNCT
|
||||
IS_SPACE
|
||||
IS_LOWER
|
||||
IS_UPPER
|
||||
IS_TITLE
|
||||
IS_ASCII
|
||||
DEF MAX_FLAG = 64
|
||||
|
||||
|
||||
cdef enum:
|
||||
NORM
|
||||
SHAPE
|
||||
LAST3
|
||||
|
||||
|
||||
cdef class Word:
|
||||
cdef class Lexeme:
|
||||
# NB: the readonly keyword refers to _Python_ access. The attributes are
|
||||
# writeable from Cython.
|
||||
cdef readonly StringHash key
|
||||
cdef readonly char** utf8_strings
|
||||
cdef readonly id_t id
|
||||
cdef readonly size_t length
|
||||
cdef readonly double prob
|
||||
cdef readonly ClusterID cluster
|
||||
cdef readonly TagFlags possible_tags
|
||||
cdef readonly DistFlags dist_flags
|
||||
cdef readonly OrthFlags orth_flags
|
||||
cdef readonly size_t cluster
|
||||
|
||||
cpdef StringHash get_view(self, size_t i) except 0
|
||||
cdef readonly utf8_t* strings
|
||||
cdef readonly size_t nr_strings
|
||||
|
||||
cdef readonly flag_t flags
|
||||
|
||||
cdef class CasedWord(Word):
|
||||
cpdef bint can_tag(self, TagFlags flag) except *
|
||||
cpdef bint check_dist_flag(self, DistFlags flag) except *
|
||||
cpdef bint check_orth_flag(self, OrthFlags flag) except *
|
||||
|
||||
cpdef bint is_often_titled(self) except *
|
||||
cpdef bint is_often_uppered(self) except *
|
||||
|
||||
cpdef bint is_alpha(self) except *
|
||||
cpdef bint is_digit(self) except *
|
||||
cpdef bint is_punct(self) except *
|
||||
cpdef bint is_space(self) except *
|
||||
cpdef bint is_lower(self) except *
|
||||
cpdef bint is_upper(self) except *
|
||||
cpdef bint is_title(self) except *
|
||||
cpdef bint is_ascii(self) except *
|
||||
cpdef bint check_flag(self, size_t flag_id) except *
|
||||
cpdef int set_flag(self, size_t flag_id) except -1
|
||||
|
||||
cpdef unicode get_string(self, size_t i) except *
|
||||
cpdef id_t get_id(self, size_t i) except 0
|
||||
cpdef int add_strings(self, list strings) except -1
|
||||
|
|
394
spacy/word.pyx
394
spacy/word.pyx
|
@ -4,40 +4,32 @@
|
|||
|
||||
from libc.stdlib cimport calloc, free
|
||||
|
||||
|
||||
# Python-visible enum for POS tags
|
||||
PUNCT = 0
|
||||
CONJ = 1
|
||||
NUM = 2
|
||||
X = 3
|
||||
DET = 4
|
||||
ADP = 5
|
||||
ADJ = 6
|
||||
ADV = 7
|
||||
VERB = 8
|
||||
NOUN = 9
|
||||
PDT = 10
|
||||
POS = 11
|
||||
PRON = 12
|
||||
PRT = 13
|
||||
from spacy cimport flags
|
||||
|
||||
|
||||
DEF OFT_UPPER = 1
|
||||
DEF OFT_TITLE = 2
|
||||
|
||||
|
||||
cdef class Word:
|
||||
cdef class Lexeme:
|
||||
"""A lexical type.
|
||||
|
||||
Clients should avoid instantiating Lexemes directly, and instead use get_lexeme
|
||||
from a language module, e.g. spacy.en.get_lexeme . This allows us to use only
|
||||
one Lexeme object per lexical type.
|
||||
|
||||
Attributes:
|
||||
string (bytes):
|
||||
A utf8-encoded byte-string for the word.
|
||||
|
||||
lex (StringHash):
|
||||
A hash of the word.
|
||||
id (view_id_t):
|
||||
A unique ID of the word's string.
|
||||
|
||||
Implemented as the memory-address of the string,
|
||||
as we use Python's string interning to guarantee that only one copy
|
||||
of each string is seen.
|
||||
|
||||
string (unicode):
|
||||
The unicode string.
|
||||
|
||||
Implemented as a property; relatively expensive.
|
||||
|
||||
length (size_t):
|
||||
The (unicode) length of the word.
|
||||
|
||||
The number of unicode code-points in the string.
|
||||
|
||||
prob (double):
|
||||
An estimate of the word's unigram log probability.
|
||||
|
||||
|
@ -60,186 +52,194 @@ cdef class Word:
|
|||
while "dapple" is totally different. On the other hand, "scalable" receives
|
||||
the same cluster ID as "pineapple", which is not what we'd like.
|
||||
"""
|
||||
def __cinit__(self, bytes string, list string_views, prob=0.0, cluster=0,
|
||||
orth_flags=0, dist_flags=0, possible_tags=0):
|
||||
self.string = <char*>string
|
||||
self.length = len(string)
|
||||
self.views = <char**>calloc(len(string_views), sizeof(StringHash))
|
||||
cdef unicode view
|
||||
for i in range(len(string_views)):
|
||||
view = string_views[i]
|
||||
self.string_views[i] = hash(view)
|
||||
def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
|
||||
cluster=0, orth_flags=0, dist_flags=0, possible_tags=0):
|
||||
self.id = <id_t>&string
|
||||
self.length = length
|
||||
self.nr_strings = 0
|
||||
self.add_views(views)
|
||||
|
||||
def __dealloc__(self):
|
||||
free(self.string_views)
|
||||
free(self.views)
|
||||
|
||||
cpdef StringHash get_view(self, size_t i) except 0:
|
||||
return self.string_views[i]
|
||||
property string:
|
||||
def __get__(self):
|
||||
return self.strings[0].decode('utf8')
|
||||
|
||||
cpdef bint check_orth_flag(self, OrthFlags flag) except *:
|
||||
"""Access the value of one of the pre-computed boolean orthographic features.
|
||||
cpdef unicode get_view_string(self, size_t i) except *:
|
||||
assert i < self.nr_strings
|
||||
return self.strings[i].decode('utf8')
|
||||
|
||||
Meanings depend on the language-specific orthographic features being loaded.
|
||||
The suggested features for latin-alphabet languages are: TODO
|
||||
"""
|
||||
return self.orth_flags & (1 << flag)
|
||||
cpdef intptr_t get_view_id(self, size_t i) except 0:
|
||||
assert i < self.nr_strings
|
||||
return <string_id_t>&self.views[i]
|
||||
|
||||
cpdef bint check_dist_flag(self, DistFlags flag) except *:
|
||||
cpdef int add_views(self, list views) except -1:
|
||||
self.nr_views += len(strings)
|
||||
self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
|
||||
cdef unicode view
|
||||
cdef bytes utf8_string
|
||||
for i, view in enumerate(strings):
|
||||
view = string_views[i]
|
||||
utf8_string = view.encode('utf8')
|
||||
# Intern strings, allowing pointer comparison
|
||||
utf8_string = intern(utf8_string)
|
||||
self.views[i] = utf8_string
|
||||
|
||||
cpdef bint check_flag(self, size_t flag_id) except *:
|
||||
"""Access the value of one of the pre-computed boolean distribution features.
|
||||
|
||||
Meanings depend on the language-specific distributional features being loaded.
|
||||
The suggested features for latin-alphabet languages are: TODO
|
||||
"""
|
||||
|
||||
return self.dist_flags & (1 << flag)
|
||||
assert flag_id < flags.MAX_FLAG
|
||||
return self.flags & (1 << flag_id)
|
||||
|
||||
cpdef bint can_tag(self, TagFlags flag) except *:
|
||||
"""Check whether the word often receives a particular tag in a large text
|
||||
corpus. "Often" is chosen by heuristic.
|
||||
"""
|
||||
return self.possible_tags & (1 << flag)
|
||||
cpdef int set_flag(self, size_t flag_id) except -1:
|
||||
assert flag_id < flags.MAX_FLAG
|
||||
self.flags |= (1 << flag_id)
|
||||
|
||||
|
||||
cdef class CasedWord(Word):
|
||||
def __cinit__(self, bytes string):
|
||||
string_views = [get_normaized(string), get_word_shape(string), string[-3:]]
|
||||
Word.__cinit__(self, string, string_views)
|
||||
|
||||
cpdef bint is_often_uppered(self) except *:
|
||||
'''Check the OFT_UPPER distributional flag for the word.
|
||||
|
||||
The OFT_UPPER flag records whether a lower-cased version of the word
|
||||
is found in all-upper case frequently in a large sample of text, where
|
||||
"frequently" is defined as P >= 0.95 (chosen for high mutual information for
|
||||
POS tagging).
|
||||
|
||||
Case statistics are estimated from a large text corpus. Estimates are read
|
||||
from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
|
||||
|
||||
>>> is_often_uppered(lookup(u'nato'))
|
||||
True
|
||||
>>> is_often_uppered(lookup(u'the'))
|
||||
False
|
||||
'''
|
||||
return self.dist_flags & (1 << OFT_UPPER)
|
||||
|
||||
|
||||
cpdef bint is_often_titled(self) except *:
|
||||
'''Check the OFT_TITLE distributional flag for the word.
|
||||
|
||||
The OFT_TITLE flag records whether a lower-cased version of the word
|
||||
is found title-cased (see string.istitle) frequently in a large sample of text,
|
||||
where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
|
||||
POS tagging).
|
||||
|
||||
Case statistics are estimated from a large text corpus. Estimates are read
|
||||
from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
|
||||
|
||||
>>> is_oft_upper(lookup(u'john'))
|
||||
True
|
||||
>>> is_oft_upper(lookup(u'Bill'))
|
||||
False
|
||||
'''
|
||||
return self.dist_flags & (1 << OFT_TITLE)
|
||||
|
||||
|
||||
cpdef bint is_alpha(self) except *:
|
||||
"""Check whether all characters in the word's string are alphabetic.
|
||||
|
||||
Should match the :py:func:`unicode.isalpha()` function.
|
||||
|
||||
>>> is_alpha(lookup(u'Hello'))
|
||||
True
|
||||
>>> is_alpha(lookup(u'العرب'))
|
||||
True
|
||||
>>> is_alpha(lookup(u'10'))
|
||||
False
|
||||
"""
|
||||
return self.orth_flags & 1 << IS_ALPHA
|
||||
|
||||
cpdef bint is_digit(self) except *:
|
||||
"""Check whether all characters in the word's string are numeric.
|
||||
|
||||
Should match the :py:func:`unicode.isdigit()` function.
|
||||
|
||||
>>> is_digit(lookup(u'10'))
|
||||
True
|
||||
>>> is_digit(lookup(u'๐'))
|
||||
True
|
||||
>>> is_digit(lookup(u'one'))
|
||||
False
|
||||
"""
|
||||
return self.orth_flags & 1 << IS_DIGIT
|
||||
|
||||
cpdef bint is_punct(self) except *:
|
||||
"""Check whether all characters belong to a punctuation unicode data category
|
||||
for a Lexeme ID.
|
||||
|
||||
>>> is_punct(lookup(u'.'))
|
||||
True
|
||||
>>> is_punct(lookup(u'⁒'))
|
||||
True
|
||||
>>> is_punct(lookup(u' '))
|
||||
False
|
||||
"""
|
||||
return self.orth_flags & 1 << IS_PUNCT
|
||||
|
||||
cpdef bint is_space(self) except *:
|
||||
"""Give the result of unicode.isspace() for a Lexeme ID.
|
||||
|
||||
>>> is_space(lookup(u'\\t'))
|
||||
True
|
||||
>>> is_space(lookup(u'<unicode space>'))
|
||||
True
|
||||
>>> is_space(lookup(u'Hi\\n'))
|
||||
False
|
||||
"""
|
||||
return self.orth_flags & 1 << IS_SPACE
|
||||
|
||||
cpdef bint is_lower(self) except *:
|
||||
"""Give the result of unicode.islower() for a Lexeme ID.
|
||||
|
||||
>>> is_lower(lookup(u'hi'))
|
||||
True
|
||||
>>> is_lower(lookup(<unicode>))
|
||||
True
|
||||
>>> is_lower(lookup(u'10'))
|
||||
False
|
||||
"""
|
||||
return self.orth_flags & 1 << IS_LOWER
|
||||
|
||||
cpdef bint is_upper(self) except *:
|
||||
"""Give the result of unicode.isupper() for a Lexeme ID.
|
||||
|
||||
>>> is_upper(lookup(u'HI'))
|
||||
True
|
||||
>>> is_upper(lookup(u'H10'))
|
||||
True
|
||||
>>> is_upper(lookup(u'10'))
|
||||
False
|
||||
"""
|
||||
return self.orth_flags & 1 << IS_UPPER
|
||||
|
||||
cpdef bint is_title(self) except *:
|
||||
"""Give the result of unicode.istitle() for a Lexeme ID.
|
||||
|
||||
>>> is_title(lookup(u'Hi'))
|
||||
True
|
||||
>>> is_title(lookup(u'Hi1'))
|
||||
True
|
||||
>>> is_title(lookup(u'1'))
|
||||
False
|
||||
"""
|
||||
return self.orth_flags & 1 << IS_TITLE
|
||||
|
||||
cpdef bint is_ascii(self) except *:
|
||||
"""Give the result of checking whether all characters in the string are ascii.
|
||||
|
||||
>>> is_ascii(lookup(u'Hi'))
|
||||
True
|
||||
>>> is_ascii(lookup(u' '))
|
||||
True
|
||||
>>> is_title(lookup(u'<unicode>'))
|
||||
False
|
||||
"""
|
||||
return self.orth_flags & 1 << IS_ASCII
|
||||
#
|
||||
#cdef class CasedWord(Word):
|
||||
# def __cinit__(self, bytes string, list views):
|
||||
# Word.__cinit__(self, string, string_views)
|
||||
#
|
||||
# cpdef bint is_often_uppered(self) except *:
|
||||
# '''Check the OFT_UPPER distributional flag for the word.
|
||||
#
|
||||
# The OFT_UPPER flag records whether a lower-cased version of the word
|
||||
# is found in all-upper case frequently in a large sample of text, where
|
||||
# "frequently" is defined as P >= 0.95 (chosen for high mutual information for
|
||||
# POS tagging).
|
||||
#
|
||||
# Case statistics are estimated from a large text corpus. Estimates are read
|
||||
# from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
|
||||
#
|
||||
# >>> is_often_uppered(lookup(u'nato'))
|
||||
# True
|
||||
# >>> is_often_uppered(lookup(u'the'))
|
||||
# False
|
||||
# '''
|
||||
# return self.dist_flags & (1 << OFT_UPPER)
|
||||
#
|
||||
#
|
||||
# cpdef bint is_often_titled(self) except *:
|
||||
# '''Check the OFT_TITLE distributional flag for the word.
|
||||
#
|
||||
# The OFT_TITLE flag records whether a lower-cased version of the word
|
||||
# is found title-cased (see string.istitle) frequently in a large sample of text,
|
||||
# where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
|
||||
# POS tagging).
|
||||
#
|
||||
# Case statistics are estimated from a large text corpus. Estimates are read
|
||||
# from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
|
||||
#
|
||||
# >>> is_oft_upper(lookup(u'john'))
|
||||
# True
|
||||
# >>> is_oft_upper(lookup(u'Bill'))
|
||||
# False
|
||||
# '''
|
||||
# return self.dist_flags & (1 << OFT_TITLE)
|
||||
#
|
||||
#
|
||||
# cpdef bint is_alpha(self) except *:
|
||||
# """Check whether all characters in the word's string are alphabetic.
|
||||
#
|
||||
# Should match the :py:func:`unicode.isalpha()` function.
|
||||
#
|
||||
# >>> is_alpha(lookup(u'Hello'))
|
||||
# True
|
||||
# >>> is_alpha(lookup(u'العرب'))
|
||||
# True
|
||||
# >>> is_alpha(lookup(u'10'))
|
||||
# False
|
||||
# """
|
||||
# return self.orth_flags & 1 << IS_ALPHA
|
||||
#
|
||||
# cpdef bint is_digit(self) except *:
|
||||
# """Check whether all characters in the word's string are numeric.
|
||||
#
|
||||
# Should match the :py:func:`unicode.isdigit()` function.
|
||||
#
|
||||
# >>> is_digit(lookup(u'10'))
|
||||
# True
|
||||
# >>> is_digit(lookup(u'๐'))
|
||||
# True
|
||||
# >>> is_digit(lookup(u'one'))
|
||||
# False
|
||||
# """
|
||||
# return self.orth_flags & 1 << IS_DIGIT
|
||||
#
|
||||
# cpdef bint is_punct(self) except *:
|
||||
# """Check whether all characters belong to a punctuation unicode data category
|
||||
# for a Lexeme ID.
|
||||
#
|
||||
# >>> is_punct(lookup(u'.'))
|
||||
# True
|
||||
# >>> is_punct(lookup(u'⁒'))
|
||||
# True
|
||||
# >>> is_punct(lookup(u' '))
|
||||
# False
|
||||
# """
|
||||
# return self.orth_flags & 1 << IS_PUNCT
|
||||
#
|
||||
# cpdef bint is_space(self) except *:
|
||||
# """Give the result of unicode.isspace() for a Lexeme ID.
|
||||
#
|
||||
# >>> is_space(lookup(u'\\t'))
|
||||
# True
|
||||
# >>> is_space(lookup(u'<unicode space>'))
|
||||
# True
|
||||
# >>> is_space(lookup(u'Hi\\n'))
|
||||
# False
|
||||
# """
|
||||
# return self.orth_flags & 1 << IS_SPACE
|
||||
#
|
||||
# cpdef bint is_lower(self) except *:
|
||||
# """Give the result of unicode.islower() for a Lexeme ID.
|
||||
#
|
||||
# >>> is_lower(lookup(u'hi'))
|
||||
# True
|
||||
# >>> is_lower(lookup(<unicode>))
|
||||
# True
|
||||
# >>> is_lower(lookup(u'10'))
|
||||
# False
|
||||
# """
|
||||
# return self.orth_flags & 1 << IS_LOWER
|
||||
#
|
||||
# cpdef bint is_upper(self) except *:
|
||||
# """Give the result of unicode.isupper() for a Lexeme ID.
|
||||
#
|
||||
# >>> is_upper(lookup(u'HI'))
|
||||
# True
|
||||
# >>> is_upper(lookup(u'H10'))
|
||||
# True
|
||||
# >>> is_upper(lookup(u'10'))
|
||||
# False
|
||||
# """
|
||||
# return self.orth_flags & 1 << IS_UPPER
|
||||
#
|
||||
# cpdef bint is_title(self) except *:
|
||||
# """Give the result of unicode.istitle() for a Lexeme ID.
|
||||
#
|
||||
# >>> is_title(lookup(u'Hi'))
|
||||
# True
|
||||
# >>> is_title(lookup(u'Hi1'))
|
||||
# True
|
||||
# >>> is_title(lookup(u'1'))
|
||||
# False
|
||||
# """
|
||||
# return self.orth_flags & 1 << IS_TITLE
|
||||
#
|
||||
# cpdef bint is_ascii(self) except *:
|
||||
# """Give the result of checking whether all characters in the string are ascii.
|
||||
#
|
||||
# >>> is_ascii(lookup(u'Hi'))
|
||||
# True
|
||||
# >>> is_ascii(lookup(u' '))
|
||||
# True
|
||||
# >>> is_title(lookup(u'<unicode>'))
|
||||
# False
|
||||
# """
|
||||
# return self.orth_flags & 1 << IS_ASCII
|
||||
|
|
Loading…
Reference in New Issue
Block a user