From e9a62b6eba3203ce6a56ebb0290b26dec767983a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 27 Aug 2014 17:15:39 +0200 Subject: [PATCH] * Refactoring with Lexeme as a class now compiles. Basic design seems to work --- spacy/en.pxd | 14 ++-- spacy/en.pyx | 189 +++++++++++---------------------------------- spacy/lang.pxd | 23 +++--- spacy/lang.pyx | 206 ++++++++++++++++++++++++++----------------------- spacy/util.py | 54 ------------- spacy/word.pxd | 18 ++--- spacy/word.pyx | 176 +++--------------------------------------- 7 files changed, 196 insertions(+), 484 deletions(-) delete mode 100644 spacy/util.py diff --git a/spacy/en.pxd b/spacy/en.pxd index b4c8bf0c8..2c9f4c718 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -1,4 +1,4 @@ -from spacy.spacy cimport Language +from spacy.lang cimport Language from spacy.word cimport Lexeme cimport cython @@ -31,12 +31,14 @@ cpdef size_t POS cpdef size_t PRON cpdef size_t PRT -cdef class English(spacy.Language): - cdef int find_split(self, unicode word) +cpdef size_t SIC +cpdef size_t CANON_CASED +cpdef size_t SHAPE +cpdef size_t NON_SPARSE -cdef English EN +cdef class English(Language): + cpdef int _split_one(self, unicode word) -cpdef Word lookup(unicode word) -cpdef list tokenize(unicode string) +cpdef English EN diff --git a/spacy/en.pyx b/spacy/en.pyx index b16f06ed6..0ee7109c1 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -31,6 +31,7 @@ same scheme. Tokenization problems are a major cause of poor performance for NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module provides a fully Penn Treebank 3-compliant tokenizer. ''' +# TODO #The script translate_treebank_tokenization can be used to transform a treebank's #annotation to use one of the spacy tokenization schemes. @@ -40,90 +41,14 @@ from __future__ import unicode_literals from libc.stdlib cimport malloc, calloc, free from libc.stdint cimport uint64_t -cimport spacy +cimport lang - -# Python-readable flag constants --- can't read an enum from Python - -# Don't want to manually assign these numbers, or we'll insert one and have to -# change them all. -# Don't use "i", as we don't want it in the global scope! 
-cdef size_t __i = 0
-
-ALPHA = __i; i += 1
-DIGIT = __i; __i += 1
-PUNCT = __i; __i += 1
-SPACE = __i; __i += 1
-LOWER = __i; __i += 1
-UPPER = __i; __i += 1
-TITLE = __i; __i += 1
-ASCII = __i; __i += 1
-
-OFT_LOWER = __i; __i += 1
-OFT_UPPER = __i; __i += 1
-OFT_TITLE = __i; __i += 1
-
-PUNCT = __i; __i += 1
-CONJ = __i; __i += 1
-NUM = __i; __i += 1
-X = __i; __i += 1
-DET = __i; __i += 1
-ADP = __i; __i += 1
-ADJ = __i; __i += 1
-ADV = __i; __i += 1
-VERB = __i; __i += 1
-NOUN = __i; __i += 1
-PDT = __i; __i += 1
-POS = __i; __i += 1
-PRON = __i; __i += 1
-PRT = __i; __i += 1
-
-
-# These are for the string views
-__i = 0
-SIC = __i; __i += 1
-CANON_CASED = __i; __i += 1
-NON_SPARSE = __i; __i += 1
-SHAPE = __i; __i += 1
-NR_STRING_VIEWS = __i
-
-
-def get_string_views(unicode string, lexeme):
-    views = ['' for _ in range(NR_STRING_VIEWS)]
-    views[SIC] = string
-    views[CANON_CASED] = canonicalize_case(string, lexeme)
-    views[SHAPE] = get_string_shape(string)
-    views[NON_SPARSE] = get_non_sparse(string, views[CANON_CASED], views[SHAPE],
-                                       lexeme)
-    return views
-
-
-def set_orth_flags(unicode string, flags_t flags)
-    setters = [
-        (ALPHA, is_alpha),
-        (DIGIT, is_digit),
-        (PUNCT, is_punct),
-        (SPACE, is_space),
-        (LOWER, is_lower),
-        (UPPER, is_upper),
-        (SPACE, is_space)
-    ]
-
-    for bit, setter in setters:
-        if setter(string):
-            flags |= 1 << bit
-    return flags
+from spacy import orth
 
-
-cdef class English(spacy.Language):
-    cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None,
-                           tag_freqs=None):
-        return Lexeme(s, length, views, prob=prob, cluster=cluster,
-                      flags=self.get_flags(string))
-
-    cdef int find_split(self, unicode word):
+cdef class English(Language):
+    cpdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):
@@ -132,17 +57,16 @@ cdef class English(spacy.Language):
         if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
-        if check_punct(word, 0, length):
+        if _check_punct(word, 0, length):
             return 1
         elif length >= 1:
             # Split off all trailing punctuation characters
             i = 0
-            while i < length and not check_punct(word, i, length):
+            while i < length and not _check_punct(word, i, length):
                 i += 1
             return i
 
-
-cdef bint check_punct(unicode word, size_t i, size_t length):
+cdef bint _check_punct(unicode word, size_t i, size_t length):
     # Don't count apostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
         return i == 0
@@ -160,69 +84,46 @@ cdef bint check_punct(unicode word, size_t i, size_t length):
 
 EN = English('en')
 
-cpdef list tokenize(unicode string):
-    """Tokenize a string.
-
-    The tokenization rules are defined in two places:
-
-    * The data/en/tokenization table, which handles special cases like contractions;
-    * The :py:meth:`spacy.en.English.find_split` function, which is used to split off punctuation etc.
-
-    Args:
-        string (unicode): The string to be tokenized.
-
-    Returns:
-        tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
-    """
-    return EN.tokenize(string)
+# Thresholds for frequency-related flags
+TAG_THRESH = 0.5
+LOWER_THRESH = 0.5
+UPPER_THRESH = 0.3
+TITLE_THRESH = 0.9
 
-cpdef Lexeme lookup(unicode string):
-    """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
+# Python-readable flag constants --- can't read an enum from Python
+ALPHA = EN.lexicon.add_flag(orth.is_alpha)
+DIGIT = EN.lexicon.add_flag(orth.is_digit)
+PUNCT = EN.lexicon.add_flag(orth.is_punct)
+SPACE = EN.lexicon.add_flag(orth.is_space)
+ASCII = EN.lexicon.add_flag(orth.is_ascii)
+TITLE = EN.lexicon.add_flag(orth.is_title)
+LOWER = EN.lexicon.add_flag(orth.is_lower)
+UPPER = EN.lexicon.add_flag(orth.is_upper)
 
-    Properties of the Lexeme are accessed by passing LexID to the accessor methods.
-    Access is cheap/free, as the LexID is the memory address of the Lexeme.
-
-    Args:
-        string (unicode): The string to be looked up. Must be unicode, not bytes.
 
+OFT_LOWER = EN.lexicon.add_flag(orth.case_trend('lower', LOWER_THRESH))
+OFT_UPPER = EN.lexicon.add_flag(orth.case_trend('upper', UPPER_THRESH))
+OFT_TITLE = EN.lexicon.add_flag(orth.case_trend('title', TITLE_THRESH))
 
-    Returns:
-        lexeme (LexID): A reference to a lexical type.
-    """
-    return EN.lookup(string)
 
+CAN_PUNCT = EN.lexicon.add_flag(orth.can_tag("PUNCT", TAG_THRESH))
+CAN_CONJ = EN.lexicon.add_flag(orth.can_tag("CONJ", TAG_THRESH))
+CAN_NUM = EN.lexicon.add_flag(orth.can_tag("NUM", TAG_THRESH))
+CAN_N = EN.lexicon.add_flag(orth.can_tag("N", TAG_THRESH))
+CAN_DET = EN.lexicon.add_flag(orth.can_tag("DET", TAG_THRESH))
+CAN_ADP = EN.lexicon.add_flag(orth.can_tag("ADP", TAG_THRESH))
+CAN_ADJ = EN.lexicon.add_flag(orth.can_tag("ADJ", TAG_THRESH))
+CAN_ADV = EN.lexicon.add_flag(orth.can_tag("ADV", TAG_THRESH))
+CAN_VERB = EN.lexicon.add_flag(orth.can_tag("VERB", TAG_THRESH))
+CAN_NOUN = EN.lexicon.add_flag(orth.can_tag("NOUN", TAG_THRESH))
+CAN_PDT = EN.lexicon.add_flag(orth.can_tag("PDT", TAG_THRESH))
+CAN_POS = EN.lexicon.add_flag(orth.can_tag("POS", TAG_THRESH))
+CAN_PRON = EN.lexicon.add_flag(orth.can_tag("PRON", TAG_THRESH))
+CAN_PRT = EN.lexicon.add_flag(orth.can_tag("PRT", TAG_THRESH))
 
-def add_string_views(view_funcs):
-    """Add a string view to existing and previous lexical entries.
-
-    Args:
-        get_view (function): A unicode --> unicode function.
-
-    Returns:
-        view_id (int): An integer key you can use to access the view.
-    """
-    pass
-
-
-def load_clusters(location):
-    """Load cluster data.
-    """
-    pass
-
-
-def load_unigram_probs(location):
-    """Load unigram probabilities.
-    """
-    pass
-
-
-def load_case_stats(location):
-    """Load case stats.
-    """
-    pass
-
-
-def load_tag_stats(location):
-    """Load tag statistics.
- """ - pass +# These are the name of string transforms +SIC = EN.lexicon.add_transform(orth.sic_string) +CANON_CASED = EN.lexicon.add_transform(orth.canon_case) +SHAPE = EN.lexicon.add_transform(orth.word_shape) +NON_SPARSE = EN.lexicon.add_transform(orth.non_sparse) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 259e4f6f8..841e18818 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -3,18 +3,23 @@ from libc.stdint cimport uint64_t from spacy.word cimport Lexeme +cdef class Lexicon: + cdef public list flag_checkers + cdef public list string_transformers + + cdef dict lexicon + + cpdef Lexeme lookup(self, unicode string) + + cdef class Language: cdef object name - cdef dict blobs - cdef dict lexicon + cdef dict cache + cpdef readonly Lexicon lexicon cpdef list tokenize(self, unicode text) - cdef Word lookup(self, unicode string) - cdef list lookup_chunk(self, unicode chunk) + cdef list _tokenize(self, unicode string) + cpdef list _split(self, unicode string) + cpdef int _split_one(self, unicode word) - cdef list new_chunk(self, unicode string, list substrings) - cdef Word new_lexeme(self, unicode lex) - - cpdef list find_substrings(self, unicode chunk) - cdef int find_split(self, unicode word) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 35d84024b..3713e9320 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -6,37 +6,37 @@ Provides the main implementation for the spacy tokenizer. Specific languages subclass the Language class, over-writing the tokenization rules as necessary. Special-case tokenization rules are read from data//tokenization . """ - - from __future__ import unicode_literals from libc.stdlib cimport calloc, free from . import util +import json from os import path cdef class Language: - view_funcs = [] def __cinit__(self, name): self.name = name - self.blobs = {} - self.lexicon = {} + self.cache = {} + self.lexicon = Lexicon() self.load_tokenization(util.read_tokenization(name)) - self.load_dist_info(util.read_dist_info(name)) cpdef list tokenize(self, unicode string): - """Tokenize. + """Tokenize a string. - Split the string into tokens. + The tokenization rules are defined in two places: + + * The data//tokenization table, which handles special cases like contractions; + * The appropriate :py:meth:`find_split` function, which is used to split + off punctuation etc. Args: - string (unicode): The string to split. + string (unicode): The string to be tokenized. Returns: - tokens (list): A list of Lexeme objects. + tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs. 
""" - cdef list blob cdef list tokens = [] cdef size_t length = len(string) cdef size_t start = 0 @@ -44,74 +44,28 @@ cdef class Language: for c in string: if c == ' ': if start < i: - blob = self.lookup_blob(string[start:i]) - tokens.extend(blob) + tokens.extend(self._tokenize(string[start:i])) start = i + 1 i += 1 if start < i: - chunk = self.lookup_blob(string[start:]) - tokens.extend(chunk) + tokens.extend(self._tokenize(string[start:])) return tokens - cdef Lexeme lookup(self, unicode string): - assert len(string) != 0 - cdef Word word - if string in self.vocab: - word = self.vocab[string] - else: - word = self.new_lexeme(string) - return word - - cdef list lookup_blob(self, unicode string): - cdef list chunk - cdef size_t blob_id - if string in self.blobs: - blob = self.blobs[string] - else: - blob = self.new_blob(string, self.find_substrings(string)) - return chunk - - cdef list new_blob(self, unicode string, list substrings): - blob = [] + cdef list _tokenize(self, unicode string): + if string in self.cache: + return self.cache[string] + cdef list lexemes = [] + substrings = self._split(string) for i, substring in enumerate(substrings): - blob.append(self.lookup(substring)) - self.blobs[string] = chunk - return blob + lexemes.append(self.lookup(substring)) + self.cache[string] = lexemes + return lexemes - cdef Word new_lexeme(self, unicode string): - # TODO - #lexeme = Lexeme(string.encode('utf8'), string_views) - #return lexeme - - """ - def add_view_funcs(self, list view_funcs): - self.view_funcs.extend(view_funcs) - cdef size_t nr_views = len(self.view_funcs) - - cdef unicode view - cdef StringHash hashed - cdef StringHash key - cdef unicode string - cdef LexID lex_id - cdef Lexeme* word - - for key, lex_id in self.vocab.items(): - word = lex_id - free(word.string_views) - word.string_views = calloc(nr_views, sizeof(StringHash)) - string = word.string[:word.length].decode('utf8') - for i, view_func in enumerate(self.view_funcs): - view = view_func(string) - hashed = hash(view) - word.string_views[i] = hashed - self.bacov[hashed] = view - """ - - cpdef list find_substrings(self, unicode blob): - """Find how to split a chunk into substrings. + cpdef list _split(self, unicode string): + """Find how to split a contiguous span of non-space characters into substrings. This method calls find_split repeatedly. Most languages will want to - override find_split, but it may be useful to override this instead. + override _split_one, but it may be useful to override this instead. Args: chunk (unicode): The string to be split, e.g. u"Mike's!" @@ -120,22 +74,22 @@ cdef class Language: substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"]. """ substrings = [] - while blob: - split = self.find_split(blob) + while string: + split = self._split_one(string) if split == 0: - substrings.append(blob) + substrings.append(string) break - substrings.append(blob[:split]) - blob = blob[split:] + substrings.append(string[:split]) + string = string[split:] return substrings - cdef int find_split(self, unicode word): + cpdef int _split_one(self, unicode word): return len(word) - def load_tokenization(self, token_rules): + def load_special_tokenization(self, token_rules): '''Load special-case tokenization rules. - Loads special-case tokenization rules into the Language.chunk cache, + Loads special-case tokenization rules into the Language.cache cache, read from data//tokenization . The special cases are loaded before any language data is tokenized, giving these priority. 
        For instance, the English tokenization rules map "ain't" to ["are", "not"].
 
         Args:
             token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                 a string and tokens is a list of strings.
         '''
-        for chunk, tokens in token_rules:
-            self.new_chunk(chunk, tokens)
+        for string, substrings in token_rules:
+            lexemes = []
+            for i, substring in enumerate(substrings):
+                lexemes.append(self.lexicon.lookup(substring))
+            self.cache[string] = lexemes
+
 
-    def load_dist_info(self, dist_info):
-        '''Load distributional information for the known lexemes of the language.
+cdef class Lexicon:
+    def __cinit__(self):
+        self.flag_checkers = []
+        self.string_transformers = []
+        self.lexicon = {}
 
-        The distributional information is read from data/<lang>/dist_info.json .
-        It contains information like the (smoothed) unigram log probability of
-        the word, how often the word is found upper-cased, how often the word
-        is found title-cased, etc.
-        '''
+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+
+        Args:
+            string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        assert len(string) != 0
+        if string in self.lexicon:
+            return self.lexicon[string]
+
+        prob = _pop_default(self.probs, string, 0.0)
+        cluster = _pop_default(self.clusters, string, 0.0)
+        case_stats = _pop_default(self.case_stats, string, {})
+        tag_stats = _pop_default(self.tag_stats, string, {})
+
+        cdef Lexeme word = Lexeme(string, prob, cluster, case_stats, tag_stats,
+                                  self.flag_checkers, self.string_transformers)
+        self.lexicon[string] = word
+        return word
+
+    def add_flag(self, flag_checker):
         cdef unicode string
-        cdef dict word_dist
-        cdef Word w
-        for string, word_dist in dist_info.items():
-            w = self.lookup(string)
-            w.prob = word_dist.prob
-            w.cluster = word_dist.cluster
-            for flag in word_dist.flags:
-                w.dist_flags |= DIST_FLAGS[flag]
-            for tag in word_dist.tagdict:
-                w.possible_tags |= TAGS[tag]
+        cdef Lexeme word
+        flag_id = len(self.flag_checkers)
+        for string, word in self.lexicon.items():
+            if flag_checker(string, word.prob, {}):
+                word.set_flag(flag_id)
+        self.flag_checkers.append(flag_checker)
+        return flag_id
+
+    def add_transform(self, string_transform):
+        self.string_transformers.append(string_transform)
+        return len(self.string_transformers) - 1
+
+    def load_probs(self, location):
+        """Load unigram probabilities.
+        """
+        self.probs = json.load(location)
+
+        cdef Lexeme word
+        cdef unicode string
+
+        for string, word in self.lexicon.items():
+            prob = _pop_default(self.probs, string, 0.0)
+            word.prob = prob
+
+    def load_clusters(self, location):
+        self.clusters = json.load(location)
+
+        cdef Lexeme word
+        cdef unicode string
+
+        for string, word in self.lexicon.items():
+            cluster = _pop_default(self.clusters, string, 0)
+            word.cluster = cluster
+
+    def load_stats(self, location):
+        """Load distributional stats.
+ """ + raise NotImplementedError + + +def _pop_default(dict d, key, default): + return d.pop(key) if key in d else default diff --git a/spacy/util.py b/spacy/util.py deleted file mode 100644 index 0e1d237a6..000000000 --- a/spacy/util.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -from os import path -import codecs -import json - -DATA_DIR = path.join(path.dirname(__file__), '..', 'data') - - -def utf8open(loc, mode='r'): - return codecs.open(loc, mode, 'utf8') - - -def load_case_stats(data_dir): - case_loc = path.join(data_dir, 'case') - case_stats = {} - with utf8open(case_loc) as cases_file: - for line in cases_file: - word, upper, title = line.split() - case_stats[word] = (float(upper), float(title)) - return case_stats - - -def read_dist_info(lang): - dist_path = path.join(DATA_DIR, lang, 'distribution_info.json') - if path.exists(dist_path): - with open(dist_path) as file_: - dist_info = json.load(file_) - else: - dist_info = {} - return dist_info - - -def read_tokenization(lang): - loc = path.join(DATA_DIR, lang, 'tokenization') - entries = [] - seen = set() - with utf8open(loc) as file_: - for line in file_: - line = line.strip() - if line.startswith('#'): - continue - if not line: - continue - pieces = line.split() - chunk = pieces.pop(0) - assert chunk not in seen, chunk - seen.add(chunk) - entries.append((chunk, list(pieces))) - if chunk[0].isalpha() and chunk[0].islower(): - chunk = chunk[0].title() + chunk[1:] - pieces[0] = pieces[0][0].title() + pieces[0][1:] - seen.add(chunk) - entries.append((chunk, pieces)) - return entries diff --git a/spacy/word.pxd b/spacy/word.pxd index 5fabf980d..4e9d416fa 100644 --- a/spacy/word.pxd +++ b/spacy/word.pxd @@ -7,19 +7,19 @@ DEF MAX_FLAG = 64 cdef class Lexeme: # NB: the readonly keyword refers to _Python_ access. The attributes are # writeable from Cython. - cdef readonly id_t id - cdef readonly size_t length - cdef readonly double prob - cdef readonly size_t cluster + cpdef readonly id_t id + cpdef readonly size_t length + cpdef readonly double prob + cpdef readonly size_t cluster - cdef readonly utf8_t* strings - cdef readonly size_t nr_strings + cdef utf8_t* views + cdef size_t nr_views cdef readonly flag_t flags cpdef bint check_flag(self, size_t flag_id) except * cpdef int set_flag(self, size_t flag_id) except -1 - cpdef unicode get_string(self, size_t i) except * - cpdef id_t get_id(self, size_t i) except 0 - cpdef int add_strings(self, list strings) except -1 + cpdef unicode get_view_string(self, size_t i) + cpdef id_t get_view_id(self, size_t i) except 0 + cpdef int add_view(self, unicode view) except -1 diff --git a/spacy/word.pyx b/spacy/word.pyx index f609c0c67..99c0845a3 100644 --- a/spacy/word.pyx +++ b/spacy/word.pyx @@ -2,10 +2,7 @@ # cython: embedsignature=True -from libc.stdlib cimport calloc, free - -from spacy cimport flags - +from libc.stdlib cimport calloc, free, realloc cdef class Lexeme: """A lexical type. @@ -53,7 +50,7 @@ cdef class Lexeme: the same cluster ID as "pineapple", which is not what we'd like. 
""" def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0, - cluster=0, orth_flags=0, dist_flags=0, possible_tags=0): + flags=0): self.id = &string self.length = length self.nr_strings = 0 @@ -66,25 +63,21 @@ cdef class Lexeme: def __get__(self): return self.strings[0].decode('utf8') - cpdef unicode get_view_string(self, size_t i) except *: + cpdef unicode get_view_string(self, size_t i): assert i < self.nr_strings return self.strings[i].decode('utf8') - cpdef intptr_t get_view_id(self, size_t i) except 0: + cpdef id_t get_view_id(self, size_t i) except 0: assert i < self.nr_strings - return &self.views[i] + return &self.views[i] - cpdef int add_views(self, list views) except -1: - self.nr_views += len(strings) + cpdef int add_view(self, unicode view) except -1: + self.nr_views += 1 self.views = realloc(self.views, self.nr_views * sizeof(utf8_t)) - cdef unicode view - cdef bytes utf8_string - for i, view in enumerate(strings): - view = string_views[i] - utf8_string = view.encode('utf8') - # Intern strings, allowing pointer comparison - utf8_string = intern(utf8_string) - self.views[i] = utf8_string + cdef bytes utf8_string = view.encode('utf8') + # Intern strings, allowing pointer comparison + utf8_string = intern(utf8_string) + self.views[self.nr_views - 1] = utf8_string cpdef bint check_flag(self, size_t flag_id) except *: """Access the value of one of the pre-computed boolean distribution features. @@ -92,154 +85,7 @@ cdef class Lexeme: Meanings depend on the language-specific distributional features being loaded. The suggested features for latin-alphabet languages are: TODO """ - assert flag_id < flags.MAX_FLAG return self.flags & (1 << flag_id) cpdef int set_flag(self, size_t flag_id) except -1: - assert flag_id < flags.MAX_FLAG self.flags |= (1 << flag_id) - - -# -#cdef class CasedWord(Word): -# def __cinit__(self, bytes string, list views): -# Word.__cinit__(self, string, string_views) -# -# cpdef bint is_often_uppered(self) except *: -# '''Check the OFT_UPPER distributional flag for the word. -# -# The OFT_UPPER flag records whether a lower-cased version of the word -# is found in all-upper case frequently in a large sample of text, where -# "frequently" is defined as P >= 0.95 (chosen for high mutual information for -# POS tagging). -# -# Case statistics are estimated from a large text corpus. Estimates are read -# from data/en/case_stats, and can be replaced using spacy.en.load_case_stats. -# -# >>> is_often_uppered(lookup(u'nato')) -# True -# >>> is_often_uppered(lookup(u'the')) -# False -# ''' -# return self.dist_flags & (1 << OFT_UPPER) -# -# -# cpdef bint is_often_titled(self) except *: -# '''Check the OFT_TITLE distributional flag for the word. -# -# The OFT_TITLE flag records whether a lower-cased version of the word -# is found title-cased (see string.istitle) frequently in a large sample of text, -# where "frequently" is defined as P >= 0.3 (chosen for high mutual information for -# POS tagging). -# -# Case statistics are estimated from a large text corpus. Estimates are read -# from data/en/case_stats, and can be replaced using spacy.en.load_case_stats. -# -# >>> is_oft_upper(lookup(u'john')) -# True -# >>> is_oft_upper(lookup(u'Bill')) -# False -# ''' -# return self.dist_flags & (1 << OFT_TITLE) -# -# -# cpdef bint is_alpha(self) except *: -# """Check whether all characters in the word's string are alphabetic. -# -# Should match the :py:func:`unicode.isalpha()` function. 
-# -# >>> is_alpha(lookup(u'Hello')) -# True -# >>> is_alpha(lookup(u'العرب')) -# True -# >>> is_alpha(lookup(u'10')) -# False -# """ -# return self.orth_flags & 1 << IS_ALPHA -# -# cpdef bint is_digit(self) except *: -# """Check whether all characters in the word's string are numeric. -# -# Should match the :py:func:`unicode.isdigit()` function. -# -# >>> is_digit(lookup(u'10')) -# True -# >>> is_digit(lookup(u'๐')) -# True -# >>> is_digit(lookup(u'one')) -# False -# """ -# return self.orth_flags & 1 << IS_DIGIT -# -# cpdef bint is_punct(self) except *: -# """Check whether all characters belong to a punctuation unicode data category -# for a Lexeme ID. -# -# >>> is_punct(lookup(u'.')) -# True -# >>> is_punct(lookup(u'⁒')) -# True -# >>> is_punct(lookup(u' ')) -# False -# """ -# return self.orth_flags & 1 << IS_PUNCT -# -# cpdef bint is_space(self) except *: -# """Give the result of unicode.isspace() for a Lexeme ID. -# -# >>> is_space(lookup(u'\\t')) -# True -# >>> is_space(lookup(u'')) -# True -# >>> is_space(lookup(u'Hi\\n')) -# False -# """ -# return self.orth_flags & 1 << IS_SPACE -# -# cpdef bint is_lower(self) except *: -# """Give the result of unicode.islower() for a Lexeme ID. -# -# >>> is_lower(lookup(u'hi')) -# True -# >>> is_lower(lookup()) -# True -# >>> is_lower(lookup(u'10')) -# False -# """ -# return self.orth_flags & 1 << IS_LOWER -# -# cpdef bint is_upper(self) except *: -# """Give the result of unicode.isupper() for a Lexeme ID. -# -# >>> is_upper(lookup(u'HI')) -# True -# >>> is_upper(lookup(u'H10')) -# True -# >>> is_upper(lookup(u'10')) -# False -# """ -# return self.orth_flags & 1 << IS_UPPER -# -# cpdef bint is_title(self) except *: -# """Give the result of unicode.istitle() for a Lexeme ID. -# -# >>> is_title(lookup(u'Hi')) -# True -# >>> is_title(lookup(u'Hi1')) -# True -# >>> is_title(lookup(u'1')) -# False -# """ -# return self.orth_flags & 1 << IS_TITLE -# -# cpdef bint is_ascii(self) except *: -# """Give the result of checking whether all characters in the string are ascii. -# -# >>> is_ascii(lookup(u'Hi')) -# True -# >>> is_ascii(lookup(u' ')) -# True -# >>> is_title(lookup(u'')) -# False -# """ -# return self.orth_flags & 1 << IS_ASCII
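
To make the control flow of the new tokenizer easier to follow, here is a rough plain-Python sketch of the scheme the patch implements in spacy/lang.pyx and spacy/en.pyx: the string is cut at spaces, each whitespace-delimited chunk is looked up in a per-chunk cache, and uncached chunks are split greedily by asking a _split_one-style rule how many characters to take next. The helper names and the simplified splitting rule below are illustrative stand-ins rather than code from the patch, and the sketch returns plain substrings instead of Lexeme objects.

def split_one(word):
    # Stand-in for English._split_one: split off a trailing "'s", a single
    # leading punctuation character, or a maximal run of word characters.
    if word.endswith("'s") and len(word) >= 3:
        return len(word) - 2
    if not word[0].isalnum() and word[0] != "'":
        return 1
    i = 0
    while i < len(word) and (word[i].isalnum() or word[i] == "'"):
        i += 1
    return i


def split(chunk):
    # Mirrors Language._split: apply split_one repeatedly to the remainder.
    substrings = []
    while chunk:
        n = split_one(chunk)
        if n == 0:
            substrings.append(chunk)
            break
        substrings.append(chunk[:n])
        chunk = chunk[n:]
    return substrings


_cache = {}

def tokenize(string):
    # Mirrors Language.tokenize/_tokenize: cut at spaces, split each chunk,
    # and cache the per-chunk result so repeated chunks are split only once.
    tokens = []
    for chunk in string.split(' '):
        if not chunk:
            continue
        if chunk not in _cache:
            _cache[chunk] = split(chunk)
        tokens.extend(_cache[chunk])
    return tokens


print(tokenize("Mike's dog!"))   # ['Mike', "'s", 'dog', '!']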
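
The other half of the refactor is the Lexicon, which owns the flag checkers and string transforms that used to be module-level globals in spacy.en. The following plain-Python sketch shows only the registration pattern: add_flag assigns the next free bit and back-fills the flag on existing entries, add_transform assigns the next view index, and lookup builds a cached entry the first time a string is seen. The predicates and transforms used here (str.isalpha, str.lower) are stand-ins for the spacy.orth functions, and the real flag checkers also receive distributional statistics, which this sketch leaves out.

class SketchLexeme:
    # Minimal stand-in for spacy.word.Lexeme: a flags bit-field plus string views.
    def __init__(self, string, flags, views):
        self.string = string
        self.flags = flags
        self.views = views

    def check_flag(self, flag_id):
        return bool(self.flags & (1 << flag_id))


class SketchLexicon:
    def __init__(self):
        self.flag_checkers = []        # flag_id -> predicate(string) -> bool
        self.string_transformers = []  # view_id -> transform(string) -> str
        self.lexicon = {}

    def add_flag(self, flag_checker):
        # Assign the next bit, back-fill already-created lexemes, then register.
        flag_id = len(self.flag_checkers)
        for word in self.lexicon.values():
            if flag_checker(word.string):
                word.flags |= 1 << flag_id
        self.flag_checkers.append(flag_checker)
        return flag_id

    def add_transform(self, string_transform):
        self.string_transformers.append(string_transform)
        return len(self.string_transformers) - 1

    def lookup(self, string):
        # Create the lexeme on first sight, computing all flags and views.
        if string not in self.lexicon:
            flags = 0
            for flag_id, checker in enumerate(self.flag_checkers):
                if checker(string):
                    flags |= 1 << flag_id
            views = [transform(string) for transform in self.string_transformers]
            self.lexicon[string] = SketchLexeme(string, flags, views)
        return self.lexicon[string]


lexicon = SketchLexicon()
IS_ALPHA = lexicon.add_flag(str.isalpha)    # stand-in for orth.is_alpha
LOWERED = lexicon.add_transform(str.lower)  # stand-in for a canon-case transform

word = lexicon.lookup("Hello")
print(word.check_flag(IS_ALPHA), word.views[LOWERED])   # True hello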