* Refactoring with Lexeme as a class now compiles. Basic design seems to work

Matthew Honnibal 2014-08-27 17:15:39 +02:00
parent 68bae2fec6
commit e9a62b6eba
7 changed files with 196 additions and 484 deletions

View File

@@ -1,4 +1,4 @@
-from spacy.spacy cimport Language
+from spacy.lang cimport Language
 from spacy.word cimport Lexeme
 cimport cython
@@ -31,12 +31,14 @@ cpdef size_t POS
 cpdef size_t PRON
 cpdef size_t PRT
-cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word)
-cdef English EN
-cpdef Word lookup(unicode word)
-cpdef list tokenize(unicode string)
+cpdef size_t SIC
+cpdef size_t CANON_CASED
+cpdef size_t SHAPE
+cpdef size_t NON_SPARSE
+cdef class English(Language):
+    cpdef int _split_one(self, unicode word)
+cpdef English EN
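
Note: this header swaps the old module-level tokenize/lookup API for an EN Language instance, and declares the string-view names (SIC, CANON_CASED, SHAPE, NON_SPARSE) as plain integer IDs. A minimal Python sketch of the numbering scheme behind those IDs; the shape function here is an illustrative stand-in, not spacy.orth's real one:

    # Each registered transform gets the next free integer ID (sketch).
    transforms = []

    def add_transform(func):
        transforms.append(func)
        return len(transforms) - 1    # the transform's integer ID

    SIC = add_transform(lambda s: s)  # "string as-is"
    SHAPE = add_transform(lambda s: ''.join(
        'X' if c.isupper() else 'x' if c.islower() else 'd' if c.isdigit() else c
        for c in s))

    print([f(u"Mike1") for f in transforms])  # ['Mike1', 'Xxxxd']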

View File

@@ -31,6 +31,7 @@ same scheme. Tokenization problems are a major cause of poor performance for
 NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
 provides a fully Penn Treebank 3-compliant tokenizer.
 '''
+# TODO
 #The script translate_treebank_tokenization can be used to transform a treebank's
 #annotation to use one of the spacy tokenization schemes.
@@ -40,90 +41,14 @@ from __future__ import unicode_literals
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
-cimport spacy
+cimport lang
+from spacy import orth
-
-# Python-readable flag constants --- can't read an enum from Python
-# Don't want to manually assign these numbers, or we'll insert one and have to
-# change them all.
-# Don't use "i", as we don't want it in the global scope!
-cdef size_t __i = 0
-ALPHA = __i; i += 1
-DIGIT = __i; __i += 1
-PUNCT = __i; __i += 1
-SPACE = __i; __i += 1
-LOWER = __i; __i += 1
-UPPER = __i; __i += 1
-TITLE = __i; __i += 1
-ASCII = __i; __i += 1
-OFT_LOWER = __i; __i += 1
-OFT_UPPER = __i; __i += 1
-OFT_TITLE = __i; __i += 1
-PUNCT = __i; __i += 1
-CONJ = __i; __i += 1
-NUM = __i; __i += 1
-X = __i; __i += 1
-DET = __i; __i += 1
-ADP = __i; __i += 1
-ADJ = __i; __i += 1
-ADV = __i; __i += 1
-VERB = __i; __i += 1
-NOUN = __i; __i += 1
-PDT = __i; __i += 1
-POS = __i; __i += 1
-PRON = __i; __i += 1
-PRT = __i; __i += 1
-
-# These are for the string views
-__i = 0
-SIC = __i; __i += 1
-CANON_CASED = __i; __i += 1
-NON_SPARSE = __i; __i += 1
-SHAPE = __i; __i += 1
-NR_STRING_VIEWS = __i
-
-
-def get_string_views(unicode string, lexeme):
-    views = ['' for _ in range(NR_STRING_VIEWS)]
-    views[SIC] = string
-    views[CANON_CASED] = canonicalize_case(string, lexeme)
-    views[SHAPE] = get_string_shape(string)
-    views[NON_SPARSE] = get_non_sparse(string, views[CANON_CASED], views[SHAPE],
-                                       lexeme)
-    return views
-
-
-def set_orth_flags(unicode string, flags_t flags)
-    setters = [
-        (ALPHA, is_alpha),
-        (DIGIT, is_digit),
-        (PUNCT, is_punct),
-        (SPACE, is_space),
-        (LOWER, is_lower),
-        (UPPER, is_upper),
-        (SPACE, is_space)
-    ]
-    for bit, setter in setters:
-        if setter(string):
-            flags |= 1 << bit
-    return flags
-cdef class English(spacy.Language):
-    cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None,
-                           tag_freqs=None):
-        return Lexeme(s, length, views, prob=prob, cluster=cluster,
-                      flags=self.get_flags(string))
-
-    cdef int find_split(self, unicode word):
+cdef class English(Language):
+    cpdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):
@@ -132,17 +57,16 @@ cdef class English(spacy.Language):
         if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
-        if check_punct(word, 0, length):
+        if _check_punct(word, 0, length):
             return 1
         elif length >= 1:
             # Split off all trailing punctuation characters
             i = 0
-            while i < length and not check_punct(word, i, length):
+            while i < length and not _check_punct(word, i, length):
                 i += 1
             return i
 
-cdef bint check_punct(unicode word, size_t i, size_t length):
+cdef bint _check_punct(unicode word, size_t i, size_t length):
     # Don't count apostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
         return i == 0
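
Note: the body of the startswith("'s") branch and the tail of _check_punct fall outside these hunks, so the plain-Python mirror below fills them with labeled assumptions (return 2 for a leading 's, a not-isalnum() catch-all for punctuation). The _split driver it uses comes from lang.pyx further down:

    def _check_punct(word, i, length):
        # Apostrophe special case is from the diff; the catch-all is assumed.
        if word[i] == "'" and i < (length - 1) and word[i + 1].isalpha():
            return i == 0
        return not word[i].isalnum()

    def _split_one(word):
        # Mirror of English._split_one above, flattened into plain Python.
        length = len(word)
        if word.startswith("'s") or word.startswith("'S"):
            return 2              # assumed: split off the leading clitic
        if word.endswith("'s") and length >= 3:
            return length - 2
        if _check_punct(word, 0, length):
            return 1
        i = 0
        while i < length and not _check_punct(word, i, length):
            i += 1
        return i

    def _split(string):
        # Mirror of Language._split (see lang.pyx below).
        substrings = []
        while string:
            n = _split_one(string)
            if n == 0:            # defensive: "don't split" sentinel
                substrings.append(string)
                break
            substrings.append(string[:n])
            string = string[n:]
        return substrings

    print(_split(u"Mike's"))  # ['Mike', "'s"]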
@@ -160,69 +84,46 @@ cdef bint check_punct(unicode word, size_t i, size_t length):
 EN = English('en')
 
-cpdef list tokenize(unicode string):
-    """Tokenize a string.
-
-    The tokenization rules are defined in two places:
-
-    * The data/en/tokenization table, which handles special cases like contractions;
-    * The :py:meth:`spacy.en.English.find_split` function, which is used to split off punctuation etc.
-
-    Args:
-        string (unicode): The string to be tokenized.
-
-    Returns:
-        tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
-    """
-    return EN.tokenize(string)
-
-cpdef Lexeme lookup(unicode string):
-    """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
-
-    Properties of the Lexeme are accessed by passing LexID to the accessor methods.
-    Access is cheap/free, as the LexID is the memory address of the Lexeme.
-
-    Args:
-        string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-    Returns:
-        lexeme (LexID): A reference to a lexical type.
-    """
-    return EN.lookup(string)
-
-def add_string_views(view_funcs):
-    """Add a string view to existing and previous lexical entries.
-
-    Args:
-        get_view (function): A unicode --> unicode function.
-
-    Returns:
-        view_id (int): An integer key you can use to access the view.
-    """
-    pass
-
-def load_clusters(location):
-    """Load cluster data.
-    """
-    pass
-
-def load_unigram_probs(location):
-    """Load unigram probabilities.
-    """
-    pass
-
-def load_case_stats(location):
-    """Load case stats.
-    """
-    pass
-
-def load_tag_stats(location):
-    """Load tag statistics.
-    """
-    pass
+# Thresholds for frequency related flags
+TAG_THRESH = 0.5
+LOWER_THRESH = 0.5
+UPPER_THRESH = 0.3
+TITLE_THRESH = 0.9
+
+# Python-readable flag constants --- can't read an enum from Python
+ALPHA = EN.lexicon.add_flag(orth.is_alpha)
+DIGIT = EN.lexicon.add_flag(orth.is_digit)
+PUNCT = EN.lexicon.add_flag(orth.is_punct)
+SPACE = EN.lexicon.add_flag(orth.is_space)
+PUNCT = EN.lexicon.add_flag(orth.is_punct)
+ASCII = EN.lexicon.add_flag(orth.is_ascii)
+TITLE = EN.lexicon.add_flag(orth.is_title)
+LOWER = EN.lexicon.add_flag(orth.is_lower)
+UPPER = EN.lexicon.add_flag(orth.is_upper)
+
+OFT_LOWER = EN.lexicon.add_flag(orth.case_trend('lower', LOWER_THRESH))
+OFT_UPPER = EN.lexicon.add_flag(orth.case_trend('upper', UPPER_THRESH))
+OFT_TITLE = EN.lexicon.add_flag(orth.case_trend('title', TITLE_THRESH))
+
+CAN_PUNCT = EN.lexicon.add_flag(orth.can_tag("PUNCT", TAG_THRESH))
+CAN_CONJ = EN.lexicon.add_flag(orth.can_tag("CONJ", TAG_THRESH))
+CAN_NUM = EN.lexicon.add_flag(orth.can_tag("NUM", TAG_THRESH))
+CAN_N = EN.lexicon.add_flag(orth.can_tag("N", TAG_THRESH))
+CAN_DET = EN.lexicon.add_flag(orth.can_tag("DET", TAG_THRESH))
+CAN_ADP = EN.lexicon.add_flag(orth.can_tag("ADP", TAG_THRESH))
+CAN_ADJ = EN.lexicon.add_flag(orth.can_tag("ADJ", TAG_THRESH))
+CAN_ADV = EN.lexicon.add_flag(orth.can_tag("ADV", TAG_THRESH))
+CAN_VERB = EN.lexicon.add_flag(orth.can_tag("VERB", TAG_THRESH))
+CAN_NOUN = EN.lexicon.add_flag(orth.can_tag("NOUN", TAG_THRESH))
+CAN_PDT = EN.lexicon.add_flag(orth.can_tag("PDT", TAG_THRESH))
+CAN_POS = EN.lexicon.add_flag(orth.can_tag("POS", TAG_THRESH))
+CAN_PRON = EN.lexicon.add_flag(orth.can_tag("PRON", TAG_THRESH))
+CAN_PRT = EN.lexicon.add_flag(orth.can_tag("PRT", TAG_THRESH))
+
+# These are the names of string transforms
+SIC = EN.lexicon.add_transform(orth.sic_string)
+CANON_CASED = EN.lexicon.add_transform(orth.canon_case)
+SHAPE = EN.lexicon.add_transform(orth.word_shape)
+NON_SPARSE = EN.lexicon.add_transform(orth.non_sparse)
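
Note: the point of this change is that flag IDs are no longer hand-numbered; Lexicon.add_flag hands out the next free bit position and back-fills existing entries. A simplified Python sketch of that mechanism (the real checkers also receive prob and case stats, as the lang.pyx diff below shows; here they take just the string):

    class MiniLexicon:
        def __init__(self):
            self.flag_checkers = []
            self.lexicon = {}            # string -> flags bitfield (stand-in Lexeme)

        def add_flag(self, checker):
            flag_id = len(self.flag_checkers)
            for string in self.lexicon:  # back-fill entries created earlier
                if checker(string):
                    self.lexicon[string] |= 1 << flag_id
            self.flag_checkers.append(checker)
            return flag_id

    lex = MiniLexicon()
    lex.lexicon[u"Hello"] = 0
    IS_ALPHA = lex.add_flag(str.isalpha)   # gets bit 0
    IS_TITLE = lex.add_flag(str.istitle)   # gets bit 1
    print(bool(lex.lexicon[u"Hello"] & (1 << IS_TITLE)))  # True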

View File

@@ -3,18 +3,23 @@ from libc.stdint cimport uint64_t
 from spacy.word cimport Lexeme
 
+cdef class Lexicon:
+    cdef public list flag_checkers
+    cdef public list string_transformers
+    cdef dict lexicon
+
+    cpdef Lexeme lookup(self, unicode string)
+
 cdef class Language:
     cdef object name
-    cdef dict blobs
-    cdef dict lexicon
+    cdef dict cache
+    cpdef readonly Lexicon lexicon
     cpdef list tokenize(self, unicode text)
-    cdef Word lookup(self, unicode string)
-    cdef list lookup_chunk(self, unicode chunk)
-    cdef list new_chunk(self, unicode string, list substrings)
-    cdef Word new_lexeme(self, unicode lex)
-    cpdef list find_substrings(self, unicode chunk)
-    cdef int find_split(self, unicode word)
+    cdef list _tokenize(self, unicode string)
+    cpdef list _split(self, unicode string)
+    cpdef int _split_one(self, unicode word)
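
Note: this header splits the responsibilities: Language owns the per-chunk cache, Lexicon owns lexeme creation and storage. A pure-Python mock of how the two classes are meant to fit together under that design (method bodies are stand-ins; the real character loop and sub-splitting are in lang.pyx below):

    class Lexicon:
        def __init__(self):
            self.lexicon = {}

        def lookup(self, string):
            # Create-on-miss; the dict value stands in for a Lexeme object.
            if string not in self.lexicon:
                self.lexicon[string] = {'sic': string, 'flags': 0}
            return self.lexicon[string]

    class Language:
        def __init__(self):
            self.cache = {}
            self.lexicon = Lexicon()

        def tokenize(self, string):
            tokens = []
            for span in string.split():      # stand-in for the char-by-char loop
                if span not in self.cache:   # each distinct span is processed once
                    self.cache[span] = [self.lexicon.lookup(span)]
                tokens.extend(self.cache[span])
            return tokens

    print(len(Language().tokenize(u"the the cat")))  # 3 tokens, 2 distinct lexemes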

View File

@@ -6,37 +6,37 @@ Provides the main implementation for the spacy tokenizer. Specific languages
 subclass the Language class, over-writing the tokenization rules as necessary.
 Special-case tokenization rules are read from data/<lang>/tokenization .
 """
 from __future__ import unicode_literals
 
 from libc.stdlib cimport calloc, free
 from . import util
+import json
 from os import path
 
 cdef class Language:
-    view_funcs = []
     def __cinit__(self, name):
         self.name = name
-        self.blobs = {}
-        self.lexicon = {}
+        self.cache = {}
+        self.lexicon = Lexicon()
         self.load_tokenization(util.read_tokenization(name))
-        self.load_dist_info(util.read_dist_info(name))
 
     cpdef list tokenize(self, unicode string):
-        """Tokenize.
+        """Tokenize a string.
 
-        Split the string into tokens.
+        The tokenization rules are defined in two places:
+
+        * The data/<lang>/tokenization table, which handles special cases like contractions;
+        * The appropriate :py:meth:`find_split` function, which is used to split
+          off punctuation etc.
 
         Args:
-            string (unicode): The string to split.
+            string (unicode): The string to be tokenized.
 
         Returns:
-            tokens (list): A list of Lexeme objects.
+            tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
-        cdef list blob
         cdef list tokens = []
         cdef size_t length = len(string)
         cdef size_t start = 0
@@ -44,74 +44,28 @@ cdef class Language:
         for c in string:
             if c == ' ':
                 if start < i:
-                    blob = self.lookup_blob(string[start:i])
-                    tokens.extend(blob)
+                    tokens.extend(self._tokenize(string[start:i]))
                 start = i + 1
             i += 1
         if start < i:
-            chunk = self.lookup_blob(string[start:])
-            tokens.extend(chunk)
+            tokens.extend(self._tokenize(string[start:]))
         return tokens
-    cdef Lexeme lookup(self, unicode string):
-        assert len(string) != 0
-        cdef Word word
-        if string in self.vocab:
-            word = self.vocab[string]
-        else:
-            word = self.new_lexeme(string)
-        return word
-
-    cdef list lookup_blob(self, unicode string):
-        cdef list chunk
-        cdef size_t blob_id
-        if string in self.blobs:
-            blob = self.blobs[string]
-        else:
-            blob = self.new_blob(string, self.find_substrings(string))
-        return chunk
-
-    cdef list new_blob(self, unicode string, list substrings):
-        blob = []
+    cdef list _tokenize(self, unicode string):
+        if string in self.cache:
+            return self.cache[string]
+        cdef list lexemes = []
+        substrings = self._split(string)
         for i, substring in enumerate(substrings):
-            blob.append(self.lookup(substring))
-        self.blobs[string] = chunk
-        return blob
+            lexemes.append(self.lookup(substring))
+        self.cache[string] = lexemes
+        return lexemes
-    cdef Word new_lexeme(self, unicode string):
-        # TODO
-        #lexeme = Lexeme(string.encode('utf8'), string_views)
-        #return lexeme
-
-    """
-    def add_view_funcs(self, list view_funcs):
-        self.view_funcs.extend(view_funcs)
-        cdef size_t nr_views = len(self.view_funcs)
-        cdef unicode view
-        cdef StringHash hashed
-        cdef StringHash key
-        cdef unicode string
-        cdef LexID lex_id
-        cdef Lexeme* word
-        for key, lex_id in self.vocab.items():
-            word = <Lexeme*>lex_id
-            free(word.string_views)
-            word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
-            string = word.string[:word.length].decode('utf8')
-            for i, view_func in enumerate(self.view_funcs):
-                view = view_func(string)
-                hashed = hash(view)
-                word.string_views[i] = hashed
-                self.bacov[hashed] = view
-    """
-
-    cpdef list find_substrings(self, unicode blob):
-        """Find how to split a chunk into substrings.
+    cpdef list _split(self, unicode string):
+        """Find how to split a contiguous span of non-space characters into substrings.
 
         This method calls find_split repeatedly. Most languages will want to
-        override find_split, but it may be useful to override this instead.
+        override _split_one, but it may be useful to override this instead.
 
         Args:
             chunk (unicode): The string to be split, e.g. u"Mike's!"
@@ -120,22 +74,22 @@ cdef class Language:
             substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
         """
         substrings = []
-        while blob:
-            split = self.find_split(blob)
+        while string:
+            split = self._split_one(string)
             if split == 0:
-                substrings.append(blob)
+                substrings.append(string)
                 break
-            substrings.append(blob[:split])
-            blob = blob[split:]
+            substrings.append(string[:split])
+            string = string[split:]
         return substrings
 
-    cdef int find_split(self, unicode word):
+    cpdef int _split_one(self, unicode word):
         return len(word)
-    def load_tokenization(self, token_rules):
+    def load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
 
-        Loads special-case tokenization rules into the Language.chunk cache,
+        Loads special-case tokenization rules into the Language.cache cache,
         read from data/<lang>/tokenization . The special cases are loaded before
         any language data is tokenized, giving these priority. For instance,
         the English tokenization rules map "ain't" to ["are", "not"].
@@ -144,25 +98,83 @@ cdef class Language:
             token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                 a string and tokens is a list of strings.
         '''
-        for chunk, tokens in token_rules:
-            self.new_chunk(chunk, tokens)
+        for string, substrings in token_rules:
+            lexemes = []
+            for i, substring in enumerate(substrings):
+                lexemes.append(self.lookup(substring))
+            self.cache[string] = lexemes
-    def load_dist_info(self, dist_info):
-        '''Load distributional information for the known lexemes of the language.
-
-        The distributional information is read from data/<lang>/dist_info.json .
-        It contains information like the (smoothed) unigram log probability of
-        the word, how often the word is found upper-cased, how often the word
-        is found title-cased, etc.
-        '''
+cdef class Lexicon:
+    def __cinit__(self):
+        self.flag_checkers = []
+        self.string_transforms = []
+        self.lexicon = {}
+
+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+
+        Args:
+            string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        assert len(string) != 0
+        if string in self.lexicon:
+            return self.lexicon[string]
+        prob = _pop_default(self.probs, string, 0.0)
+        cluster = _pop_default(self.clusters, string, 0.0)
+        case_stats = _pop_default(self.case_stats, string, {})
+        tag_stats = _pop_default(self.tag_stats, string, {})
+        cdef Lexeme word = Lexeme(string, prob, cluster, case_stats, tag_stats,
+                                  self.flag_checkers, self.string_transformers)
+        self.lexicon[string] = word
+        return word
-        cdef unicode string
-        cdef dict word_dist
-        cdef Word w
-        for string, word_dist in dist_info.items():
-            w = self.lookup(string)
-            w.prob = word_dist.prob
-            w.cluster = word_dist.cluster
-            for flag in word_dist.flags:
-                w.dist_flags |= DIST_FLAGS[flag]
-            for tag in word_dist.tagdict:
-                w.possible_tags |= TAGS[tag]
+    def add_flag(self, flag_checker):
+        cdef unicode string
+        cdef Lexeme word
+        flag_id = len(self.flag_checkers)
+        for string, word in self.lexicon.items():
+            if flag_checker(string, word.prob, {}):
+                word.set_flag(flag_id)
+        self.flag_checkers.append(flag_checker)
+        return flag_id
+
+    def add_transform(self, string_transform):
+        self.string_transformers.append(string_transform)
+        return len(self.string_transformers) - 1
+
+    def load_probs(self, location):
+        """Load unigram probabilities.
+        """
+        self.probs = json.load(location)
+        cdef Lexeme word
+        cdef unicode string
+        for string, word in self.lexicon.items():
+            prob = _pop_default(self.probs, string, 0.0)
+            word.prob = prob
+
+    def load_clusters(self, location):
+        self.probs = json.load(location)
+        cdef Lexeme word
+        cdef unicode string
+        for string, word in self.lexicon.items():
+            cluster = _pop_default(self.cluster, string, 0)
+            word.cluster = cluster
+
+    def load_stats(self, location):
+        """Load distributional stats.
+        """
+        raise NotImplementedError
+
+def _pop_default(dict d, key, default):
+    return d.pop(key) if key in d else default
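
Note: _pop_default reads and removes the entry, so each string's distributional stats are consumed exactly once, when its Lexeme is first built, and the loaded tables shrink as the lexicon fills. The same helper in plain Python:

    def _pop_default(d, key, default):
        return d.pop(key) if key in d else default

    probs = {u"the": -1.2}
    print(_pop_default(probs, u"the", 0.0))  # -1.2
    print(_pop_default(probs, u"the", 0.0))  # 0.0 (already consumed)
    print(probs)                             # {}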

View File

@@ -1,54 +0,0 @@
-import os
-from os import path
-import codecs
-import json
-
-DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
-
-
-def utf8open(loc, mode='r'):
-    return codecs.open(loc, mode, 'utf8')
-
-
-def load_case_stats(data_dir):
-    case_loc = path.join(data_dir, 'case')
-    case_stats = {}
-    with utf8open(case_loc) as cases_file:
-        for line in cases_file:
-            word, upper, title = line.split()
-            case_stats[word] = (float(upper), float(title))
-    return case_stats
-
-
-def read_dist_info(lang):
-    dist_path = path.join(DATA_DIR, lang, 'distribution_info.json')
-    if path.exists(dist_path):
-        with open(dist_path) as file_:
-            dist_info = json.load(file_)
-    else:
-        dist_info = {}
-    return dist_info
-
-
-def read_tokenization(lang):
-    loc = path.join(DATA_DIR, lang, 'tokenization')
-    entries = []
-    seen = set()
-    with utf8open(loc) as file_:
-        for line in file_:
-            line = line.strip()
-            if line.startswith('#'):
-                continue
-            if not line:
-                continue
-            pieces = line.split()
-            chunk = pieces.pop(0)
-            assert chunk not in seen, chunk
-            seen.add(chunk)
-            entries.append((chunk, list(pieces)))
-            if chunk[0].isalpha() and chunk[0].islower():
-                chunk = chunk[0].title() + chunk[1:]
-                pieces[0] = pieces[0][0].title() + pieces[0][1:]
-                seen.add(chunk)
-                entries.append((chunk, pieces))
-    return entries
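
Note: although this commit deletes the file, read_tokenization still documents the special-case data format: whitespace-separated "chunk piece piece ..." lines, '#' comments, and an auto-generated title-cased variant for lower-cased chunks. A plain-Python mirror of that parse; the sample data is hypothetical, not quoted from data/en/tokenization:

    import io

    def parse(lines):
        entries = []
        for line in lines:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            pieces = line.split()
            chunk = pieces.pop(0)
            entries.append((chunk, pieces))
            if chunk[0].isalpha() and chunk[0].islower():
                # Auto-add the title-cased variant, title-casing the first piece.
                entries.append((chunk[0].title() + chunk[1:],
                                [pieces[0][0].title() + pieces[0][1:]] + pieces[1:]))
        return entries

    print(parse(io.StringIO(u"# contractions\nain't are not\n")))
    # [("ain't", ['are', 'not']), ("Ain't", ['Are', 'not'])]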

View File

@@ -7,19 +7,19 @@ DEF MAX_FLAG = 64
 cdef class Lexeme:
     # NB: the readonly keyword refers to _Python_ access. The attributes are
     # writeable from Cython.
-    cdef readonly id_t id
-    cdef readonly size_t length
-    cdef readonly double prob
-    cdef readonly size_t cluster
-    cdef readonly utf8_t* strings
-    cdef readonly size_t nr_strings
+    cpdef readonly id_t id
+    cpdef readonly size_t length
+    cpdef readonly double prob
+    cpdef readonly size_t cluster
+    cdef utf8_t* views
+    cdef size_t nr_views
     cdef readonly flag_t flags
 
     cpdef bint check_flag(self, size_t flag_id) except *
     cpdef int set_flag(self, size_t flag_id) except -1
-    cpdef unicode get_string(self, size_t i) except *
-    cpdef id_t get_id(self, size_t i) except 0
-    cpdef int add_strings(self, list strings) except -1
+    cpdef unicode get_view_string(self, size_t i)
+    cpdef id_t get_view_id(self, size_t i) except 0
+    cpdef int add_view(self, unicode view) except -1
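
Note: flag_t packs up to MAX_FLAG (64) booleans into one integer; set_flag ORs in bit flag_id and check_flag masks it back out. The same arithmetic in plain Python:

    MAX_FLAG = 64

    def set_flag(flags, flag_id):
        assert flag_id < MAX_FLAG
        return flags | (1 << flag_id)   # turn bit flag_id on

    def check_flag(flags, flag_id):
        return bool(flags & (1 << flag_id))

    flags = set_flag(0, 3)
    print(check_flag(flags, 3), check_flag(flags, 5))  # True False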

View File

@@ -2,10 +2,7 @@
 # cython: embedsignature=True
-from libc.stdlib cimport calloc, free
-from spacy cimport flags
+from libc.stdlib cimport calloc, free, realloc
 
 cdef class Lexeme:
     """A lexical type.
@@ -53,7 +50,7 @@ cdef class Lexeme:
     the same cluster ID as "pineapple", which is not what we'd like.
     """
     def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
-                  cluster=0, orth_flags=0, dist_flags=0, possible_tags=0):
+                  flags=0):
         self.id = <id_t>&string
         self.length = length
         self.nr_strings = 0
@@ -66,25 +63,21 @@ cdef class Lexeme:
         def __get__(self):
             return self.strings[0].decode('utf8')
 
-    cpdef unicode get_view_string(self, size_t i) except *:
+    cpdef unicode get_view_string(self, size_t i):
         assert i < self.nr_strings
         return self.strings[i].decode('utf8')
 
-    cpdef intptr_t get_view_id(self, size_t i) except 0:
+    cpdef id_t get_view_id(self, size_t i) except 0:
         assert i < self.nr_strings
-        return <string_id_t>&self.views[i]
+        return <id_t>&self.views[i]
 
-    cpdef int add_views(self, list views) except -1:
-        self.nr_views += len(strings)
+    cpdef int add_view(self, unicode view) except -1:
+        self.nr_views += 1
         self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
-        cdef unicode view
-        cdef bytes utf8_string
-        for i, view in enumerate(strings):
-            view = string_views[i]
-            utf8_string = view.encode('utf8')
-            # Intern strings, allowing pointer comparison
-            utf8_string = intern(utf8_string)
-            self.views[i] = utf8_string
+        cdef bytes utf8_string = view.encode('utf8')
+        # Intern strings, allowing pointer comparison
+        utf8_string = intern(utf8_string)
+        self.views[self.nr_views - 1] = utf8_string
 
     cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.
@@ -92,154 +85,7 @@ cdef class Lexeme:
         Meanings depend on the language-specific distributional features being loaded.
        The suggested features for latin-alphabet languages are: TODO
         """
-        assert flag_id < flags.MAX_FLAG
         return self.flags & (1 << flag_id)
 
     cpdef int set_flag(self, size_t flag_id) except -1:
-        assert flag_id < flags.MAX_FLAG
         self.flags |= (1 << flag_id)
-#
-#cdef class CasedWord(Word):
-#    def __cinit__(self, bytes string, list views):
-#        Word.__cinit__(self, string, string_views)
-#
-#    cpdef bint is_often_uppered(self) except *:
-#        '''Check the OFT_UPPER distributional flag for the word.
-#
-#        The OFT_UPPER flag records whether a lower-cased version of the word
-#        is found in all-upper case frequently in a large sample of text, where
-#        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
-#        POS tagging).
-#
-#        Case statistics are estimated from a large text corpus. Estimates are read
-#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-#
-#        >>> is_often_uppered(lookup(u'nato'))
-#        True
-#        >>> is_often_uppered(lookup(u'the'))
-#        False
-#        '''
-#        return self.dist_flags & (1 << OFT_UPPER)
-#
-#
-#    cpdef bint is_often_titled(self) except *:
-#        '''Check the OFT_TITLE distributional flag for the word.
-#
-#        The OFT_TITLE flag records whether a lower-cased version of the word
-#        is found title-cased (see string.istitle) frequently in a large sample of text,
-#        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
-#        POS tagging).
-#
-#        Case statistics are estimated from a large text corpus. Estimates are read
-#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-#
-#        >>> is_oft_upper(lookup(u'john'))
-#        True
-#        >>> is_oft_upper(lookup(u'Bill'))
-#        False
-#        '''
-#        return self.dist_flags & (1 << OFT_TITLE)
-#
-#
-#    cpdef bint is_alpha(self) except *:
-#        """Check whether all characters in the word's string are alphabetic.
-#
-#        Should match the :py:func:`unicode.isalpha()` function.
-#
-#        >>> is_alpha(lookup(u'Hello'))
-#        True
-#        >>> is_alpha(lookup(u'العرب'))
-#        True
-#        >>> is_alpha(lookup(u'10'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_ALPHA
-#
-#    cpdef bint is_digit(self) except *:
-#        """Check whether all characters in the word's string are numeric.
-#
-#        Should match the :py:func:`unicode.isdigit()` function.
-#
-#        >>> is_digit(lookup(u'10'))
-#        True
-#        >>> is_digit(lookup(u''))
-#        True
-#        >>> is_digit(lookup(u'one'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_DIGIT
-#
-#    cpdef bint is_punct(self) except *:
-#        """Check whether all characters belong to a punctuation unicode data category
-#        for a Lexeme ID.
-#
-#        >>> is_punct(lookup(u'.'))
-#        True
-#        >>> is_punct(lookup(u'⁒'))
-#        True
-#        >>> is_punct(lookup(u' '))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_PUNCT
-#
-#    cpdef bint is_space(self) except *:
-#        """Give the result of unicode.isspace() for a Lexeme ID.
-#
-#        >>> is_space(lookup(u'\\t'))
-#        True
-#        >>> is_space(lookup(u'<unicode space>'))
-#        True
-#        >>> is_space(lookup(u'Hi\\n'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_SPACE
-#
-#    cpdef bint is_lower(self) except *:
-#        """Give the result of unicode.islower() for a Lexeme ID.
-#
-#        >>> is_lower(lookup(u'hi'))
-#        True
-#        >>> is_lower(lookup(<unicode>))
-#        True
-#        >>> is_lower(lookup(u'10'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_LOWER
-#
-#    cpdef bint is_upper(self) except *:
-#        """Give the result of unicode.isupper() for a Lexeme ID.
-#
-#        >>> is_upper(lookup(u'HI'))
-#        True
-#        >>> is_upper(lookup(u'H10'))
-#        True
-#        >>> is_upper(lookup(u'10'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_UPPER
-#
-#    cpdef bint is_title(self) except *:
-#        """Give the result of unicode.istitle() for a Lexeme ID.
-#
-#        >>> is_title(lookup(u'Hi'))
-#        True
-#        >>> is_title(lookup(u'Hi1'))
-#        True
-#        >>> is_title(lookup(u'1'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_TITLE
-#
-#    cpdef bint is_ascii(self) except *:
-#        """Give the result of checking whether all characters in the string are ascii.
-#
-#        >>> is_ascii(lookup(u'Hi'))
-#        True
-#        >>> is_ascii(lookup(u' '))
-#        True
-#        >>> is_title(lookup(u'<unicode>'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_ASCII
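
Note: add_view above interns each encoded view string so that equal views share a single object and can later be compared by pointer, which is what get_view_id's address cast relies on. A Python sketch of the same idea (Python 3 moved the builtin to sys.intern; the commit's Cython code uses the Python 2 builtin):

    import sys

    # Build the strings at runtime so they start as distinct objects.
    a = sys.intern(''.join(['Xx', 'xxx']))
    b = sys.intern(''.join(['Xxx', 'xx']))
    print(a == b, a is b)  # True True -- interning yields one shared object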