From 59b41a9fd37a6643c16fe1345df8aacdde176141 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 10 Oct 2014 08:11:31 +1100 Subject: [PATCH] * Switch to new data model, tests passing --- spacy/en.pyx | 6 -- spacy/lang.pyx | 133 +++++----------------------------- spacy/lexeme.pxd | 1 + spacy/lexeme.pyx | 25 ++++--- spacy/orth.py | 40 ++++------- spacy/tokens.pxd | 20 +++--- spacy/tokens.pyx | 138 ++++++++++++------------------------ spacy/word.pxd | 3 +- spacy/word.pyx | 55 +++++--------- tests/test_flag_features.py | 40 +++++------ tests/test_is_punct.py | 6 +- tests/test_lexeme_flags.py | 13 ++-- tests/test_orth.py | 3 +- 13 files changed, 151 insertions(+), 332 deletions(-) diff --git a/spacy/en.pyx b/spacy/en.pyx index a51349116..f29e45c9c 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -37,13 +37,7 @@ provides a fully Penn Treebank 3-compliant tokenizer. from __future__ import unicode_literals -from libc.stdint cimport uint64_t - cimport lang -from spacy.lexeme cimport lexeme_check_flag -from spacy.lexeme cimport lexeme_string_view - -from spacy import orth cdef class English(Language): diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 35d1838b2..73f5d358a 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -15,7 +15,7 @@ import re from .util import read_lang_data from spacy.tokens import Tokens -from spacy.lexeme cimport LexemeC, lexeme_init, lexeme_pack, lexeme_unpack +from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack from murmurhash.mrmr cimport hash64 from cpython.ref cimport Py_INCREF @@ -30,99 +30,11 @@ from spacy import orth from spacy import util -cdef enum Flags: - Flag_IsAlpha - Flag_IsAscii - Flag_IsDigit - Flag_IsLower - Flag_IsPunct - Flag_IsSpace - Flag_IsTitle - Flag_IsUpper - - Flag_CanAdj - Flag_CanAdp - Flag_CanAdv - Flag_CanConj - Flag_CanDet - Flag_CanNoun - Flag_CanNum - Flag_CanPdt - Flag_CanPos - Flag_CanPron - Flag_CanPrt - Flag_CanPunct - Flag_CanVerb - - Flag_OftLower - Flag_OftTitle - Flag_OftUpper - Flag_N - - -cdef enum Views: - View_CanonForm - View_WordShape - View_NonSparse - View_Asciied - View_N - - -# Assign the flag and view functions by enum value. -# This is verbose, but it ensures we don't get nasty order sensitivities. -STRING_VIEW_FUNCS = [None] * View_N -STRING_VIEW_FUNCS[View_CanonForm] = orth.canon_case -STRING_VIEW_FUNCS[View_WordShape] = orth.word_shape -STRING_VIEW_FUNCS[View_NonSparse] = orth.non_sparse -STRING_VIEW_FUNCS[View_Asciied] = orth.asciied - -FLAG_FUNCS = [None] * Flag_N -FLAG_FUNCS[Flag_IsAlpha] = orth.is_alpha -FLAG_FUNCS[Flag_IsAscii] = orth.is_ascii -FLAG_FUNCS[Flag_IsDigit] = orth.is_digit -FLAG_FUNCS[Flag_IsLower] = orth.is_lower -FLAG_FUNCS[Flag_IsPunct] = orth.is_punct -FLAG_FUNCS[Flag_IsSpace] = orth.is_space -FLAG_FUNCS[Flag_IsTitle] = orth.is_title -FLAG_FUNCS[Flag_IsUpper] = orth.is_upper - -FLAG_FUNCS[Flag_CanAdj] = orth.can_tag('ADJ') -FLAG_FUNCS[Flag_CanAdp] = orth.can_tag('ADP') -FLAG_FUNCS[Flag_CanAdv] = orth.can_tag('ADV') -FLAG_FUNCS[Flag_CanConj] = orth.can_tag('CONJ') -FLAG_FUNCS[Flag_CanDet] = orth.can_tag('DET') -FLAG_FUNCS[Flag_CanNoun] = orth.can_tag('NOUN') -FLAG_FUNCS[Flag_CanNum] = orth.can_tag('NUM') -FLAG_FUNCS[Flag_CanPdt] = orth.can_tag('PDT') -FLAG_FUNCS[Flag_CanPos] = orth.can_tag('POS') -FLAG_FUNCS[Flag_CanPron] = orth.can_tag('PRON') -FLAG_FUNCS[Flag_CanPrt] = orth.can_tag('PRT') -FLAG_FUNCS[Flag_CanPunct] = orth.can_tag('PUNCT') -FLAG_FUNCS[Flag_CanVerb] = orth.can_tag('VERB') - -FLAG_FUNCS[Flag_OftLower] = orth.oft_case('lower', 0.7) -FLAG_FUNCS[Flag_OftTitle] = orth.oft_case('title', 0.7) -FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7) - - cdef class Language: """Base class for language-specific tokenizers. - Most subclasses will override the _split or _split_one methods, which take - a string of non-whitespace characters and output a list of strings. This - function is called by _tokenize, which sits behind a cache and turns the - list of strings into Lexeme objects via the Lexicon. Most languages will not - need to override _tokenize or tokenize. - - The language is supplied a list of boolean functions, used to compute flag - features. These are passed to the language's Lexicon object. - The language's name is used to look up default data-files, found in data//tokenization table, which handles special cases like contractions; - * The appropriate :py:meth:`find_split` function, which is used to split - off punctuation etc. + * The data//prefix file, used to build a regex to split off prefixes; + * The data//suffix file, used to build a regex to split off suffixes. Args: string (unicode): The string to be tokenized. Returns: - tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs. + tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes. """ cdef size_t length = len(string) cdef Tokens tokens = Tokens(length) @@ -339,10 +249,8 @@ cdef class Language: cdef class Lexicon: - def __cinit__(self, lexemes, string_features, flag_features): + def __cinit__(self, lexemes): self._mem = Pool() - self._flag_features = flag_features - self._string_features = string_features self._dict = PreshMap(2 ** 20) self.size = 0 cdef String string @@ -351,29 +259,22 @@ cdef class Lexicon: for lexeme_dict in lexemes: string_from_unicode(&string, lexeme_dict['string']) lexeme = self._mem.alloc(1, sizeof(LexemeC)) - lexeme.views = self._mem.alloc(len(string_features), sizeof(char*)) lexeme_unpack(lexeme, lexeme_dict) self._dict.set(string.key, lexeme) self.size += 1 cdef LexemeC* get(self, String* string) except NULL: - cdef LexemeC* lexeme - lexeme = self._dict.get(string.key) - if lexeme != NULL: - return lexeme - - cdef unicode uni_string = string.chars[:string.n] - views = [string_view(uni_string, 0.0, 0, {}, {}) - for string_view in self._string_features] - flags = set() - for i, flag_feature in enumerate(self._flag_features): - if flag_feature(uni_string, 0.0, {}, {}): - flags.add(i) - - lexeme = lexeme_init(self._mem, self.size, uni_string, 0, 0, views, flags) - self._dict.set(string.key, lexeme) + cdef LexemeC* lex + lex = self._dict.get(string.key) + if lex != NULL: + return lex + + lex = self._mem.alloc(1, sizeof(LexemeC)) + cdef unicode unicode_string = string.chars[:string.n] + lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string)) + self._dict.set(string.key, lex) self.size += 1 - return lexeme + return lex cpdef Lexeme lookup(self, unicode uni_string): """Retrieve (or create, if not found) a Lexeme for a string, and return it. diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 09d10d0b7..d7c85619d 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -70,6 +70,7 @@ cdef struct LexemeC: flag_t orth_flags flag_t dist_flags +cpdef dict get_lexeme_dict(size_t i, unicode string) cdef char* intern_and_encode(unicode string, size_t* length) except NULL diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index d09dfb72d..b84ed4a02 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -6,7 +6,7 @@ import orth OOV_DIST_FLAGS = 0 -def get_lexeme_dict(size_t i, unicode string): +cpdef dict get_lexeme_dict(size_t i, unicode string): ints = [None for _ in range(LexInt_N)] ints[LexInt_i] = i ints[LexInt_length] = len(string) @@ -18,13 +18,12 @@ def get_lexeme_dict(size_t i, unicode string): floats[LexFloat_prob] = 0 floats[LexFloat_sentiment] = 0 - cdef size_t length strings = [None for _ in range(LexStr_N)] - strings[LexStr_key] = intern_and_encode(string, &length) + strings[LexStr_key] = string strings[LexStr_casefix] = strings[LexStr_key] - strings[LexStr_shape] = intern_and_encode(orth.word_shape(string), &length) + strings[LexStr_shape] = orth.word_shape(string) strings[LexStr_unsparse] = strings[LexStr_shape] - strings[LexStr_asciied] = intern_and_encode(orth.asciied(string), &length) + strings[LexStr_asciied] = orth.asciied(string) orth_flags = get_orth_flags(string) dist_flags = OOV_DIST_FLAGS @@ -33,8 +32,18 @@ def get_lexeme_dict(size_t i, unicode string): 'orth_flags': orth_flags, 'dist_flags': dist_flags} def get_orth_flags(unicode string): - return 0 + cdef flag_t flags = 0 + flags |= orth.is_ascii(string) << LexOrth_ascii + flags |= orth.is_alpha(string) << LexOrth_alpha + flags |= orth.is_digit(string) << LexOrth_digit + flags |= orth.is_lower(string) << LexOrth_lower + flags |= orth.is_punct(string) << LexOrth_punct + flags |= orth.is_space(string) << LexOrth_space + flags |= orth.is_title(string) << LexOrth_title + flags |= orth.is_upper(string) << LexOrth_upper + + return flags def get_dist_flags(unicode string): return 0 @@ -87,9 +96,9 @@ cdef int lexeme_unpack(LexemeC* lex, dict p) except -1: for i, lex_int in enumerate(p['ints']): lex.ints[i] = lex_int for i, lex_float in enumerate(p['floats']): - lex.ints[i] = lex_int + lex.floats[i] = lex_float cdef size_t _ for i, lex_string in enumerate(p['strings']): lex.strings[i] = intern_and_encode(lex_string, &_) lex.orth_flags = p['orth_flags'] - lex.orth_flags = p['orth_flags'] + lex.dist_flags = p['dist_flags'] diff --git a/spacy/orth.py b/spacy/orth.py index 685de191c..53dbcf863 100644 --- a/spacy/orth.py +++ b/spacy/orth.py @@ -1,6 +1,7 @@ # -*- coding: utf8 -*- from __future__ import unicode_literals import unicodedata +from unidecode import unidecode import math @@ -9,15 +10,15 @@ TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split # Binary string features -def is_alpha(string, prob, case_stats, tag_stats): +def is_alpha(string): return string.isalpha() -def is_digit(string, prob, case_stats, tag_stats): +def is_digit(string): return string.isdigit() -def is_punct(string, prob, case_stats, tag_stats): +def is_punct(string): for c in string: if not unicodedata.category(c).startswith('P'): return False @@ -25,11 +26,11 @@ def is_punct(string, prob, case_stats, tag_stats): return True -def is_space(string, prob, case_stats, tag_stats): +def is_space(string): return string.isspace() -def is_ascii(string, prob, case_stats, tag_stats): +def is_ascii(string): for c in string: if ord(c) >= 128: return False @@ -37,15 +38,15 @@ def is_ascii(string, prob, case_stats, tag_stats): return True -def is_title(string, prob, case_stats, tag_stats): +def is_title(string): return string.istitle() -def is_lower(string, prob, case_stats, tag_stats): +def is_lower(string): return string.islower() -def is_upper(string, prob, case_stats, tag_stats): +def is_upper(string): return string.isupper() @@ -103,7 +104,7 @@ def word_shape(string, *args): def non_sparse(string, prob, cluster, case_stats, tag_stats): - if is_alpha(string, prob, case_stats, tag_stats): + if is_alpha(string): return canon_case(string, prob, cluster, case_stats, tag_stats) elif prob >= math.log(0.0001): return string @@ -112,22 +113,5 @@ def non_sparse(string, prob, cluster, case_stats, tag_stats): def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None): - '''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.''' - # Snippet from - # http://www.physic.ut.ee/~kkannike/english/prog/python/util/asciify/index.html - # TODO: Rewrite and improve this - lookup_table = { - u'“': '"', - u'”': '"' - } - temp = u'' - for char in string: - if char in lookup_table: - temp += lookup_table[char] - else: - decomp = unicodedata.decomposition(char) - if decomp: # Not an empty string - temp += unichr(int(decomp.split()[0], 16)) - else: - temp += char - return temp + ascii_string = unidecode(string) + return ascii_string.decode('ascii') diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index bff4c7742..b138387bf 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -5,21 +5,17 @@ from libcpp.vector cimport vector cdef class Tokens: cdef vector[LexemeC*] *v - cpdef size_t id(self, size_t i) except 0 cpdef unicode string(self, size_t i) - cpdef double prob(self, size_t i) except 1 - cpdef size_t cluster(self, size_t i) except * - cpdef bint check_flag(self, size_t i, size_t flag_id) except * + cpdef float prob(self, size_t i) except 1 + cpdef int cluster(self, size_t i) except * + cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except * + cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except * cpdef unicode string_view(self, size_t i, size_t view_id) - cpdef size_t canon(self, size_t i) except 0 - cpdef size_t shape(self, size_t i) except 0 - cpdef size_t non_sparse(self, size_t i) except 0 - cpdef size_t asciied(self, size_t i) except 0 - cpdef unicode canon_string(self, size_t i) - cpdef unicode shape_string(self, size_t i) - cpdef unicode non_sparse_string(self, size_t i) - cpdef unicode asciied_string(self, size_t i) + cpdef unicode casefix(self, size_t i) + cpdef unicode shape(self, size_t i) + cpdef unicode unsparse(self, size_t i) + cpdef unicode asciied(self, size_t i) cpdef bint is_alpha(self, size_t i) except * cpdef bint is_ascii(self, size_t i) except * cpdef bint is_digit(self, size_t i) except * diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 64ddf5c29..18f0c1533 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,45 +1,7 @@ # cython: profile=True -from spacy.word cimport Lexeme -from spacy.lexeme cimport lexeme_check_flag -from spacy.lexeme cimport lexeme_string_view +from .word cimport Lexeme - -cdef enum Flags: - Flag_IsAlpha - Flag_IsAscii - Flag_IsDigit - Flag_IsLower - Flag_IsPunct - Flag_IsSpace - Flag_IsTitle - Flag_IsUpper - - Flag_CanAdj - Flag_CanAdp - Flag_CanAdv - Flag_CanConj - Flag_CanDet - Flag_CanNoun - Flag_CanNum - Flag_CanPdt - Flag_CanPos - Flag_CanPron - Flag_CanPrt - Flag_CanPunct - Flag_CanVerb - - Flag_OftLower - Flag_OftTitle - Flag_OftUpper - Flag_N - - -cdef enum Views: - View_CanonForm - View_WordShape - View_NonSparse - View_Asciied - View_N +from .lexeme cimport * cdef class Tokens: @@ -79,120 +41,108 @@ cdef class Tokens: self.v.push_back(lexeme._c) cpdef unicode string(self, size_t i): - cdef bytes utf8_string = self.v.at(i).string[:self.v.at(i).length] + cdef bytes utf8_string = self.v.at(i).strings[LexStr_key] cdef unicode string = utf8_string.decode('utf8') return string - cpdef size_t id(self, size_t i) except 0: - return &self.v.at(i).string + cpdef float prob(self, size_t i) except 1: + return self.v.at(i).floats[LexFloat_prob] - cpdef double prob(self, size_t i) except 1: - return self.v.at(i).prob + cpdef int cluster(self, size_t i) except *: + return self.v.at(i).ints[LexInt_cluster] - cpdef size_t cluster(self, size_t i) except *: - return self.v.at(i).cluster + cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *: + return lexeme_check_orth_flag(self.v.at(i), flag_id) - cpdef bint check_flag(self, size_t i, size_t flag_id) except *: - return lexeme_check_flag(self.v.at(i), flag_id) + cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *: + return lexeme_check_dist_flag(self.v.at(i), flag_id) cpdef unicode string_view(self, size_t i, size_t view_id): - return lexeme_string_view(self.v.at(i), view_id) + return lexeme_get_string(self.v.at(i), view_id) # Provide accessor methods for the features supported by the language. # Without these, clients have to use the underlying string_view and check_flag # methods, which requires them to know the IDs. - cpdef unicode canon_string(self, size_t i): - return lexeme_string_view(self.v.at(i), View_CanonForm) + cpdef unicode casefix(self, size_t i): + return lexeme_get_string(self.v.at(i), LexStr_casefix) - cpdef unicode shape_string(self, size_t i): - return lexeme_string_view(self.v.at(i), View_WordShape) + cpdef unicode shape(self, size_t i): + return lexeme_get_string(self.v.at(i), LexStr_shape) - cpdef unicode non_sparse_string(self, size_t i): - return lexeme_string_view(self.v.at(i), View_NonSparse) + cpdef unicode unsparse(self, size_t i): + return lexeme_get_string(self.v.at(i), LexStr_unsparse) - cpdef unicode asciied_string(self, size_t i): - return lexeme_string_view(self.v.at(i), View_Asciied) + cpdef unicode asciied(self, size_t i): + return lexeme_get_string(self.v.at(i), LexStr_asciied) - cpdef size_t canon(self, size_t i) except *: - return id(self.v.at(i).views[View_CanonForm]) - - cpdef size_t shape(self, size_t i) except *: - return id(self.v.at(i).views[View_WordShape]) - - cpdef size_t non_sparse(self, size_t i) except *: - return id(self.v.at(i).views[View_NonSparse]) - - cpdef size_t asciied(self, size_t i) except *: - return id(self.v.at(i).views[View_Asciied]) - cpdef bint is_alpha(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_IsAlpha) + return lexeme_check_orth_flag(self.v.at(i), LexOrth_alpha) cpdef bint is_ascii(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_IsAscii) + return lexeme_check_orth_flag(self.v.at(i), LexOrth_ascii) cpdef bint is_digit(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_IsDigit) + return lexeme_check_orth_flag(self.v.at(i), LexOrth_digit) cpdef bint is_lower(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_IsLower) + return lexeme_check_orth_flag(self.v.at(i), LexOrth_lower) cpdef bint is_punct(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_IsPunct) + return lexeme_check_orth_flag(self.v.at(i), LexOrth_punct) cpdef bint is_space(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_IsSpace) + return lexeme_check_orth_flag(self.v.at(i), LexOrth_space) cpdef bint is_title(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_IsTitle) + return lexeme_check_orth_flag(self.v.at(i), LexOrth_title) cpdef bint is_upper(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_IsUpper) + return lexeme_check_orth_flag(self.v.at(i), LexOrth_upper) cpdef bint can_adj(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanAdj) + return lexeme_check_dist_flag(self.v.at(i), LexDist_adj) cpdef bint can_adp(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanAdp) + return lexeme_check_dist_flag(self.v.at(i), LexDist_adp) cpdef bint can_adv(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanAdv) + return lexeme_check_dist_flag(self.v.at(i), LexDist_adv) cpdef bint can_conj(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanConj) + return lexeme_check_dist_flag(self.v.at(i), LexDist_conj) cpdef bint can_det(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanDet) + return lexeme_check_dist_flag(self.v.at(i), LexDist_det) cpdef bint can_noun(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanNoun) + return lexeme_check_dist_flag(self.v.at(i), LexDist_noun) cpdef bint can_num(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanNum) + return lexeme_check_dist_flag(self.v.at(i), LexDist_num) cpdef bint can_pdt(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanPdt) + return lexeme_check_dist_flag(self.v.at(i), LexDist_pdt) cpdef bint can_pos(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanPos) + return lexeme_check_dist_flag(self.v.at(i), LexDist_pos) cpdef bint can_pron(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanPron) + return lexeme_check_dist_flag(self.v.at(i), LexDist_pron) cpdef bint can_prt(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanPrt) + return lexeme_check_dist_flag(self.v.at(i), LexDist_prt) cpdef bint can_punct(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanPunct) + return lexeme_check_dist_flag(self.v.at(i), LexDist_punct) cpdef bint can_verb(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_CanVerb) + return lexeme_check_dist_flag(self.v.at(i), LexDist_verb) cpdef bint oft_lower(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_OftLower) + return lexeme_check_dist_flag(self.v.at(i), LexDist_lower) cpdef bint oft_title(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_OftTitle) + return lexeme_check_dist_flag(self.v.at(i), LexDist_title) cpdef bint oft_upper(self, size_t i) except *: - return lexeme_check_flag(self.v.at(i), Flag_OftUpper) + return lexeme_check_dist_flag(self.v.at(i), LexDist_upper) diff --git a/spacy/word.pxd b/spacy/word.pxd index 5428d667a..ac9c7bb0e 100644 --- a/spacy/word.pxd +++ b/spacy/word.pxd @@ -7,5 +7,6 @@ DEF MAX_FLAG = 64 cdef class Lexeme: cdef LexemeC* _c - cpdef bint check_flag(self, size_t flag_id) except * + cpdef bint check_orth_flag(self, size_t flag_id) except * + cpdef bint check_dist_flag(self, size_t flag_id) except * cpdef unicode string_view(self, size_t view_id) diff --git a/spacy/word.pyx b/spacy/word.pyx index 745832775..617e8809f 100644 --- a/spacy/word.pyx +++ b/spacy/word.pyx @@ -1,7 +1,10 @@ # cython: profile=True # cython: embedsignature=True -from spacy.lexeme cimport lexeme_check_flag, lexeme_string_view +from .lexeme cimport lexeme_get_string +from .lexeme cimport lexeme_check_orth_flag, lexeme_check_dist_flag + +from .lexeme cimport * cdef class Lexeme: @@ -51,49 +54,27 @@ cdef class Lexeme: property string: def __get__(self): - cdef bytes utf8_string = self._c.string + cdef bytes utf8_string = self._c.strings[LexStr_key] cdef unicode string = utf8_string.decode('utf8') return string property prob: - def __get__(self): return self._c.prob + def __get__(self): + return self._c.floats[LexFloat_prob] + property cluster: - def __get__(self): return self._c.cluster + def __get__(self): + return self._c.ints[LexInt_cluster] + property length: - def __get__(self): return self._c.length + def __get__(self): + return self._c.ints[LexInt_length] - cpdef bint check_flag(self, size_t flag_id) except *: - """Lexemes may store language-specific boolean features in a bit-field, - with values accessed by providing an ID constant to this function. + cpdef bint check_orth_flag(self, size_t flag_id) except *: + return lexeme_check_orth_flag(self._c, flag_id) - The ID constants are exposed as global variables in the language module, - e.g. - - >>> from spacy.en import EN - >>> lexeme = EN.lookup(u'Nasa') - >>> lexeme.check_flag(EN.IS_UPPER) - False - >>> lexeme.check_flag(EN.OFT_UPPER) - True - """ - return lexeme_check_flag(self._c, flag_id) + cpdef bint check_dist_flag(self, size_t flag_id) except *: + return lexeme_check_dist_flag(self._c, flag_id) cpdef unicode string_view(self, size_t view_id): - """Lexemes may store language-specific string-view features, obtained - by transforming the string, possibly in light of distributional information. - The string-view features are accessed by providing an ID constant to this - function. - - The ID constants are exposed as global variables in the language module, - e.g. - - >>> from spacy.en import EN - >>> lexeme = EN.lookup(u'Nasa') - >>> lexeme.string_view(EN.CANON_CASED) - u'NASA' - >>> lexeme.string_view(EN.SHAPE) - u'Xxxx' - >>> lexeme.string_view(EN.NON_SPARSE) - u'Xxxx' - """ - return lexeme_string_view(self._c, view_id) + return lexeme_get_string(self._c, view_id) diff --git a/tests/test_flag_features.py b/tests/test_flag_features.py index 61c013e68..2e20ea0e5 100644 --- a/tests/test_flag_features.py +++ b/tests/test_flag_features.py @@ -16,26 +16,26 @@ def words(): "!d", "\nd"] def test_is_alpha(words): - assert is_alpha(words[0], 0, {}, {}) == False - assert is_alpha(words[1], 0, {}, {}) == False - assert is_alpha(words[2], 0, {}, {}) == False - assert is_alpha(words[3], 0, {}, {}) == True - assert is_alpha(words[4], 0, {}, {}) == True - assert is_alpha(words[5], 0, {}, {}) == False - assert is_alpha(words[6], 0, {}, {}) == False - assert is_alpha(words[7], 0, {}, {}) == False - assert is_alpha(words[8], 0, {}, {}) == False - assert is_alpha(words[9], 0, {}, {}) == False + assert not is_alpha(words[0]) + assert not is_alpha(words[1]) + assert not is_alpha(words[2]) + assert is_alpha(words[3]) + assert is_alpha(words[4]) + assert not is_alpha(words[5]) + assert not is_alpha(words[6]) + assert not is_alpha(words[7]) + assert not is_alpha(words[8]) + assert not is_alpha(words[9]) def test_is_digit(words): - assert is_digit(words[0], 0, {}, {}) == True - assert is_digit(words[1], 0, {}, {}) == False - assert is_digit(words[2], 0, {}, {}) == False - assert is_digit(words[3], 0, {}, {}) == False - assert is_digit(words[4], 0, {}, {}) == False - assert is_digit(words[5], 0, {}, {}) == False - assert is_digit(words[6], 0, {}, {}) == False - assert is_digit(words[7], 0, {}, {}) == False - assert is_digit(words[8], 0, {}, {}) == False - assert is_digit(words[9], 0, {}, {}) == False + assert is_digit(words[0]) + assert not is_digit(words[1]) + assert not is_digit(words[2]) + assert not is_digit(words[3]) + assert not is_digit(words[4]) + assert not is_digit(words[5]) + assert not is_digit(words[6]) + assert not is_digit(words[7]) + assert not is_digit(words[8]) + assert not is_digit(words[9]) diff --git a/tests/test_is_punct.py b/tests/test_is_punct.py index 687f5cf31..242e31212 100644 --- a/tests/test_is_punct.py +++ b/tests/test_is_punct.py @@ -5,12 +5,12 @@ from spacy.orth import is_punct def test_comma(): - assert is_punct(',', 0, {}, {}) == True + assert is_punct(',') def test_space(): - assert is_punct(' ', 0, {}, {}) == False + assert not is_punct(' ') def test_letter(): - assert is_punct('a', 0, {}, {}) == False + assert not is_punct('a') diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py index 4818c33b1..c6ff44757 100644 --- a/tests/test_lexeme_flags.py +++ b/tests/test_lexeme_flags.py @@ -3,23 +3,24 @@ from __future__ import unicode_literals import pytest from spacy.en import * +from spacy.lexeme import * def test_is_alpha(): the = EN.lookup('the') - assert the.check_flag(EN.fl_is_alpha) + assert the.check_orth_flag(LexOrth_alpha) year = EN.lookup('1999') - assert not year.check_flag(EN.fl_is_alpha) + assert not year.check_orth_flag(LexOrth_alpha) mixed = EN.lookup('hello1') - assert not mixed.check_flag(EN.fl_is_alpha) + assert not mixed.check_orth_flag(LexOrth_alpha) def test_is_digit(): the = EN.lookup('the') - assert not the.check_flag(EN.fl_is_digit) + assert not the.check_orth_flag(LexOrth_digit) year = EN.lookup('1999') - assert year.check_flag(EN.fl_is_digit) + assert year.check_orth_flag(LexOrth_digit) mixed = EN.lookup('hello1') - assert not mixed.check_flag(EN.fl_is_digit) + assert not mixed.check_orth_flag(LexOrth_digit) diff --git a/tests/test_orth.py b/tests/test_orth.py index 0840af683..a6be98b05 100644 --- a/tests/test_orth.py +++ b/tests/test_orth.py @@ -4,6 +4,7 @@ import pytest import spacy.word from spacy.en import EN +from spacy.lexeme import * @pytest.fixture @@ -12,7 +13,7 @@ def C3P0(): def test_shape(C3P0): - assert C3P0.string_view(EN.v_shape) == "XdXd" + assert C3P0.string_view(LexStr_shape) == "XdXd" def test_length():