* Make flag-setting a language-specific thing

Matthew Honnibal 2014-12-03 11:04:00 +11:00
parent 71b009e323
commit b463a7eb86
6 changed files with 224 additions and 146 deletions
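In outline, the change works like this: each language class now owns a `set_flags` callback that packs boolean orthographic features into a single bitfield, and the shared `Lexicon` stores whatever callback it was constructed with and applies it when building a new `Lexeme`. A minimal plain-Python sketch of the pattern (hypothetical names mirroring the diff, not spaCy's actual API):

```python
# Minimal sketch of the pattern this commit introduces (hypothetical,
# plain Python; the real code is Cython).
IS_ALPHA, IS_DIGIT, IS_TITLE = 0, 1, 2   # bit positions, as in the FlagID enum

def english_set_flags(string):
    # Language-specific: pack each boolean feature into its own bit.
    flags = 0
    flags |= string.isalpha() << IS_ALPHA
    flags |= string.isdigit() << IS_DIGIT
    flags |= string.istitle() << IS_TITLE
    return flags

class Lexicon(object):
    # Language-agnostic: store the callback, apply it on first sight of a string.
    def __init__(self, set_flags=None):
        self.set_flags = set_flags
        self._cache = {}

    def get(self, string):
        if string not in self._cache:
            self._cache[string] = {'sic': string, 'flags': self.set_flags(string)}
        return self._cache[string]

lexicon = Lexicon(english_set_flags)
assert lexicon.get(u'Hello')['flags'] & (1 << IS_TITLE)
```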

View File

@@ -1,6 +1,32 @@
 from spacy.lang cimport Language
 from spacy.tokens cimport Tokens
 
+# Flags
+cpdef enum FlagID:
+    IS_ALPHA
+    IS_ASCII
+    IS_DIGIT
+    IS_LOWER
+    IS_PUNCT
+    IS_SPACE
+    IS_TITLE
+    IS_UPPER
+
+    LIKE_URL
+    LIKE_NUMBER
+
+    OFT_LOWER
+    OFT_TITLE
+    OFT_UPPER
+
+    IN_MALES
+    IN_FEMALES
+    IN_SURNAMES
+    IN_PLACES
+    IN_GAMES
+    IN_CELEBS
+    IN_NAMES
+
 
 cdef class English(Language):
     pass
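The FlagID members above take consecutive values starting at 0 and are used as bit positions, not as values in their own right. A short sketch of the convention (assumed values, matching declaration order):

```python
# Enum members declared in order get the values 0, 1, 2, ..., and each
# names one bit inside a 64-bit flags field (assumed convention).
IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER = 0, 1, 2, 3

flags = (1 << IS_ALPHA) | (1 << IS_LOWER)   # an alphabetic, lowercase token
assert flags & (1 << IS_LOWER)
assert not flags & (1 << IS_DIGIT)
```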

View File

@@ -38,6 +38,8 @@ provides a fully Penn Treebank 3-compliant tokenizer.
 from __future__ import unicode_literals
 
 cimport lang
+from .typedefs cimport flags_t
+import orth
 
 
 cdef class English(Language):
@@ -47,7 +49,20 @@ cdef class English(Language):
     name (unicode): The two letter code used by Wikipedia for the language.
     lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    pass
+    def set_flags(self, unicode string):
+        cdef flags_t flags = 0
+        flags |= orth.is_alpha(string) << IS_ALPHA
+        flags |= orth.is_ascii(string) << IS_ASCII
+        flags |= orth.is_digit(string) << IS_DIGIT
+        flags |= orth.is_lower(string) << IS_LOWER
+        flags |= orth.is_punct(string) << IS_PUNCT
+        flags |= orth.is_space(string) << IS_SPACE
+        flags |= orth.is_title(string) << IS_TITLE
+        flags |= orth.is_upper(string) << IS_UPPER
+        flags |= orth.like_url(string) << LIKE_URL
+        flags |= orth.like_number(string) << LIKE_NUMBER
+        return flags
+
 
 EN = English('en')
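A detail worth noting about `set_flags`: the `predicate << BIT` idiom assumes each orth predicate returns exactly 0 or 1; a larger truthy integer would spill into neighbouring bits. A runnable sketch of the idiom, with plain-Python stand-ins for the orth module:

```python
# The same bit-packing idiom as set_flags above; bool() makes explicit the
# contract that each predicate contributes exactly one bit (stand-ins, not
# the real orth module).
IS_ALPHA, IS_TITLE = 0, 6   # bit positions as declared in the FlagID enum

def pack(string):
    flags = 0
    flags |= bool(string.isalpha()) << IS_ALPHA
    flags |= bool(string.istitle()) << IS_TITLE
    return flags

assert pack(u'Token') == (1 << IS_ALPHA) | (1 << IS_TITLE)
```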

View File

@@ -8,23 +8,17 @@ from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
 from .tokens cimport Tokens
 from .lexeme cimport Lexeme
-from .tagger cimport Tagger
-from .utf8string cimport StringStore
-
-
-cdef struct String:
-    Py_UNICODE* chars
-    size_t n
-    hash_t key
+from .utf8string cimport StringStore, UniStr
 
 
 cdef class Lexicon:
+    cpdef public set_flags
+
     cdef Pool mem
     cpdef readonly size_t size
     cpdef readonly StringStore strings
     cdef vector[Lexeme*] lexemes
 
-    cdef Lexeme* get(self, String* s) except NULL
+    cdef Lexeme* get(self, UniStr* s) except NULL
 
     cdef PreshMap _map
@@ -43,10 +37,10 @@ cdef class Language:
 
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
-    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
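The deleted String struct and the new UniStr carry the same three fields: a pointer into a character buffer, a length, and a precomputed hash key. A rough Python model of what `slice_unicode` (now provided by utf8string, not shown in this diff) produces; Python's `hash()` stands in for the 64-bit MurmurHash used in the C code:

```python
from collections import namedtuple

# Hypothetical model of the UniStr span struct: a view into a parent
# string plus a precomputed key, built by slice_unicode.
UniStr = namedtuple('UniStr', ['chars', 'n', 'key'])

def slice_unicode(chars, start, end):
    span = chars[start:end]
    return UniStr(span, end - start, hash(span))

s = slice_unicode(u"don't", 0, 2)   # the "do" prefix
assert (s.chars, s.n) == (u'do', 2)
```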

View File

@@ -19,6 +19,8 @@ from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport init as lexeme_init
+from .utf8string cimport slice_unicode
+
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens
@@ -34,7 +36,7 @@ cdef class Language:
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
         self._infix_re = re.compile(infix)
-        self.lexicon = Lexicon()
+        self.lexicon = Lexicon(self.set_flags)
         if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
             self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
             self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
@@ -45,11 +47,11 @@ cdef class Language:
         cdef Tokens tokens = Tokens(self.lexicon.strings, length)
         if length == 0:
             return tokens
-        cdef String string_struct
+        cdef UniStr string_struct
         cdef unicode py_string
         cdef int idx = 0
         for i, py_string in enumerate(strings):
-            string_from_unicode(&string_struct, py_string)
+            slice_unicode(&string_struct, py_string, 0, len(py_string))
             tokens.push_back(idx, self.lexicon.get(&string_struct))
             idx += len(py_string) + 1
         return tokens
@@ -77,11 +79,11 @@ cdef class Language:
         cdef int start = 0
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
-        cdef String span
+        cdef UniStr span
         for i in range(1, length):
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
-                    string_slice(&span, chars, start, i)
+                    slice_unicode(&span, chars, start, i)
                     lexemes = <Lexeme**>self._cache.get(span.key)
                     if lexemes != NULL:
                         tokens.extend(start, lexemes, 0)
@@ -93,7 +95,7 @@ cdef class Language:
                     start += 1
         i += 1
         if start < i:
-            string_slice(&span, chars, start, i)
+            slice_unicode(&span, chars, start, i)
             lexemes = <Lexeme**>self._cache.get(span.key)
             if lexemes != NULL:
                 tokens.extend(start, lexemes, 0)
@@ -101,7 +103,7 @@ cdef class Language:
                 self._tokenize(tokens, &span, start, i)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
+    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
         cdef hash_t orig_key
@@ -112,20 +114,20 @@ cdef class Language:
         self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
         self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
 
-    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL:
         cdef size_t i
-        cdef String prefix
-        cdef String suffix
-        cdef String minus_pre
-        cdef String minus_suf
+        cdef UniStr prefix
+        cdef UniStr suffix
+        cdef UniStr minus_pre
+        cdef UniStr minus_suf
         cdef size_t last_size = 0
         while string.n != 0 and string.n != last_size:
             last_size = string.n
             pre_len = self._find_prefix(string.chars, string.n)
             if pre_len != 0:
-                string_slice(&prefix, string.chars, 0, pre_len)
-                string_slice(&minus_pre, string.chars, pre_len, string.n)
+                slice_unicode(&prefix, string.chars, 0, pre_len)
+                slice_unicode(&minus_pre, string.chars, pre_len, string.n)
                 # Check whether we've hit a special-case
                 if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
                     string[0] = minus_pre
@@ -133,15 +135,15 @@ cdef class Language:
                     break
             suf_len = self._find_suffix(string.chars, string.n)
             if suf_len != 0:
-                string_slice(&suffix, string.chars, string.n - suf_len, string.n)
-                string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
+                slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
                 # Check whether we've hit a special-case
                 if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
                     string[0] = minus_suf
                     suffixes.push_back(self.lexicon.get(&suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= string.n:
-                string_slice(string, string.chars, pre_len, string.n - suf_len)
+                slice_unicode(string, string.chars, pre_len, string.n - suf_len)
                 prefixes.push_back(self.lexicon.get(&prefix))
                 suffixes.push_back(self.lexicon.get(&suffix))
             elif pre_len:
@@ -155,13 +157,13 @@ cdef class Language:
         return string
 
     cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, String* string,
+                            int idx, UniStr* string,
                             vector[Lexeme*] *prefixes,
                             vector[Lexeme*] *suffixes) except -1:
         cdef int split
         cdef Lexeme** lexemes
         cdef Lexeme* lexeme
-        cdef String span
+        cdef UniStr span
         if prefixes.size():
             idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
@@ -174,11 +176,11 @@ cdef class Language:
             if split == 0 or split == -1:
                 idx = tokens.push_back(idx, self.lexicon.get(string))
             else:
-                string_slice(&span, string.chars, 0, split)
+                slice_unicode(&span, string.chars, 0, split)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
-                string_slice(&span, string.chars, split, split+1)
+                slice_unicode(&span, string.chars, split, split+1)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
-                string_slice(&span, string.chars, split + 1, string.n)
+                slice_unicode(&span, string.chars, split + 1, string.n)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
         cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
@@ -222,14 +224,14 @@ cdef class Language:
         '''
        cdef Lexeme** lexemes
         cdef hash_t hashed
-        cdef String string
+        cdef UniStr string
         for uni_string, substrings in token_rules:
             lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
             for i, substring in enumerate(substrings):
-                string_from_unicode(&string, substring)
+                slice_unicode(&string, substring, 0, len(substring))
                 lexemes[i] = <Lexeme*>self.lexicon.get(&string)
             lexemes[i + 1] = NULL
-            string_from_unicode(&string, uni_string)
+            slice_unicode(&string, uni_string, 0, len(uni_string))
             self._specials.set(string.key, lexemes)
             self._cache.set(string.key, lexemes)
@@ -239,21 +241,23 @@ cdef class Lexicon:
 
     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self):
+    def __init__(self, object set_flags=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.size = 1
+        self.set_flags = set_flags
 
-    cdef Lexeme* get(self, String* string) except NULL:
+    cdef Lexeme* get(self, UniStr* string) except NULL:
         '''Retrieve a pointer to a Lexeme from the lexicon.'''
         cdef Lexeme* lex
         lex = <Lexeme*>self._map.get(string.key)
         if lex != NULL:
             return lex
         lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
+        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
+                             self.strings, {'flags': self.set_flags(string.chars[:string.n])})
         self._map.set(string.key, lex)
         while self.lexemes.size() < (lex.id + 1):
             self.lexemes.push_back(&EMPTY_LEXEME)
@@ -283,14 +287,14 @@ cdef class Lexicon:
         '''
         if type(id_or_string) == int:
             return self.lexemes.at(id_or_string)[0]
-        cdef String string
-        string_from_unicode(&string, id_or_string)
+        cdef UniStr string
+        slice_unicode(&string, id_or_string, 0, len(id_or_string))
         cdef Lexeme* lexeme = self.get(&string)
         return lexeme[0]
 
     def __setitem__(self, unicode uni_string, dict props):
-        cdef String s
-        string_from_unicode(&s, uni_string)
+        cdef UniStr s
+        slice_unicode(&s, uni_string, 0, len(uni_string))
         cdef Lexeme* lex = self.get(&s)
         lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
@@ -338,14 +342,3 @@ cdef class Lexicon:
                 i += 1
             self.size += 1
         fclose(fp)
-
-
-cdef void string_from_unicode(String* s, unicode uni):
-    cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
-    string_slice(s, c_uni, 0, len(uni))
-
-
-cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
-    s.chars = &chars[start]
-    s.n = end - start
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
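For orientation, the `_split_affixes` loop peels prefix and suffix punctuation off a token until nothing more matches or a special-case entry is found. A simplified, runnable model (plain strings instead of UniStr spans, hypothetical affix patterns, and no special-case table):

```python
import re

# Toy model of the affix-splitting loop above. Real spaCy loads its affix
# patterns from data files; these two patterns are illustrative only.
PREFIX_RE = re.compile(r'^[\("]')
SUFFIX_RE = re.compile(r'[\)",\.]$')

def split_affixes(string):
    prefixes, suffixes = [], []
    last_size = -1
    while string and len(string) != last_size:
        last_size = len(string)
        m = PREFIX_RE.search(string)
        if m:                          # strip one prefix character
            prefixes.append(m.group())
            string = string[m.end():]
        m = SUFFIX_RE.search(string)
        if m:                          # strip one suffix character
            suffixes.insert(0, m.group())
            string = string[:m.start()]
    return prefixes, string, suffixes

assert split_affixes(u'("hello,")') == ([u'(', u'"'], u'hello', [u',', u'"', u')'])
```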

View File

@@ -1,61 +1,119 @@
-from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
+from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
 from .utf8string cimport StringStore
-from libc.stdint cimport uint16_t
-
-cpdef flag_t OOV_DIST_FLAGS
 
-# Flags
-cpdef enum:
-    IS_ALPHA
-    IS_ASCII
-    IS_DIGIT
-    IS_LOWER
-    IS_PUNCT
-    IS_SPACE
-    IS_TITLE
-    IS_UPPER
-
-    LIKE_URL
-    LIKE_NUMBER
-
-    OFT_LOWER
-    OFT_TITLE
-    OFT_UPPER
-
-    IN_MALES
-    IN_FEMALES
-    IN_SURNAMES
-    IN_PLACES
-    IN_GAMES
-    IN_CELEBS
-    IN_NAMES
+# Reserve 64 values for flag features
+cpdef enum attr_id_t:
+    FLAG0
+    FLAG1
+    FLAG2
+    FLAG3
+    FLAG4
+    FLAG5
+    FLAG6
+    FLAG7
+    FLAG8
+    FLAG9
+    FLAG10
+    FLAG11
+    FLAG12
+    FLAG13
+    FLAG14
+    FLAG15
+    FLAG16
+    FLAG17
+    FLAG18
+    FLAG19
+    FLAG20
+    FLAG21
+    FLAG22
+    FLAG23
+    FLAG24
+    FLAG25
+    FLAG26
+    FLAG27
+    FLAG28
+    FLAG29
+    FLAG30
+    FLAG31
+    FLAG32
+    FLAG33
+    FLAG34
+    FLAG35
+    FLAG36
+    FLAG37
+    FLAG38
+    FLAG39
+    FLAG40
+    FLAG41
+    FLAG42
+    FLAG43
+    FLAG44
+    FLAG45
+    FLAG46
+    FLAG47
+    FLAG48
+    FLAG49
+    FLAG50
+    FLAG51
+    FLAG52
+    FLAG53
+    FLAG54
+    FLAG55
+    FLAG56
+    FLAG57
+    FLAG58
+    FLAG59
+    FLAG60
+    FLAG61
+    FLAG62
+    FLAG63
+
+    ID
+    SIC
+    NORM
+    SHAPE
+    ASCIIED
+    PREFIX
+    SUFFIX
+
+    LENGTH
+    CLUSTER
+    POS_TYPE
+    SENSE_TYPE
 
 
 cdef struct Lexeme:
-    flag_t flags
+    flags_t flags
 
-    id_t id
-    id_t sic
-    id_t norm
-    id_t shape
-    id_t asciied
-    id_t prefix
-    id_t suffix
+    attr_t id
+    attr_t sic
+    attr_t norm
+    attr_t shape
+    attr_t asciied
+    attr_t prefix
+    attr_t suffix
+
+    attr_t length
+    attr_t cluster
+    attr_t pos_type
+    attr_t sense_type
 
     float prob
+    float upper_pc
+    float title_pc
 
-    len_t length
-    tag_t cluster
-    tag_t postype
-    tag_t supersense
 
 cdef Lexeme EMPTY_LEXEME
 
-cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
-                  StringStore store, dict props) except *
+cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
+                  dict props) except *
 
 
-cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
+cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil:
     return lexeme.flags & (1 << flag_id)
 
+
+cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id)
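The attr_id_t layout does double duty: the first 64 members (FLAG0 through FLAG63) are bit positions within flags_t, and the members from ID onward name whole-valued fields. That is what lets get_attr dispatch on a single comparison against `sizeof(flags_t) * 8`. A small Python model of the scheme (field values assumed):

```python
from enum import IntEnum

# Hypothetical model: ids below 64 address single bits in the flags field;
# ids from 64 up name whole-valued attributes (FLAG0..FLAG63 occupy 0..63).
class AttrId(IntEnum):
    ID = 64
    SIC = 65
    NORM = 66

N_FLAG_BITS = 64   # sizeof(flags_t) * 8 for a 64-bit flags_t

def get_attr(lex, attr_id):
    if attr_id < N_FLAG_BITS:
        return (lex['flags'] >> attr_id) & 1      # check_flag
    return lex[AttrId(attr_id).name.lower()]      # whole-valued field

lex = {'flags': 0b101, 'id': 7, 'sic': 42, 'norm': 42}
assert get_attr(lex, 2) == 1                # FLAG2 is set
assert get_attr(lex, AttrId.SIC) == 42
```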

View File

@@ -6,67 +6,59 @@ from libc.string cimport memset
 
 import orth
-from .utf8string cimport Utf8Str
-
-OOV_DIST_FLAGS = 0
 
 memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
 
-def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
-    cdef flag_t flags = 0
-    flags |= orth.is_alpha(string) << IS_ALPHA
-    flags |= orth.is_ascii(string) << IS_ASCII
-    flags |= orth.is_digit(string) << IS_DIGIT
-    flags |= orth.is_lower(string) << IS_LOWER
-    flags |= orth.is_punct(string) << IS_PUNCT
-    flags |= orth.is_space(string) << IS_SPACE
-    flags |= orth.is_title(string) << IS_TITLE
-    flags |= orth.is_upper(string) << IS_UPPER
-    flags |= orth.like_url(string) << LIKE_URL
-    flags |= orth.like_number(string) << LIKE_NUMBER
-    return flags
-
 
 cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
-                  StringStore store, dict props) except *:
+                  StringStore string_store, dict props) except *:
     cdef Lexeme lex
     lex.id = i
     lex.length = len(string)
-    lex.sic = get_string_id(string, store)
+    lex.sic = string_store[string]
 
     lex.cluster = props.get('cluster', 0)
-    lex.postype = props.get('postype', 0)
-    lex.supersense = props.get('supersense', 0)
+    lex.pos_type = props.get('pos_type', 0)
+    lex.sense_type = props.get('sense_type', 0)
     lex.prob = props.get('prob', 0)
 
-    cdef float upper_pc = props.get('upper_pc', 0.0)
-    cdef float lower_pc = props.get('lower_pc', 0.0)
-    cdef float title_pc = props.get('title_pc', 0.0)
+    lex.upper_pc = props.get('upper_pc', 0.0)
+    lex.title_pc = props.get('lower_pc', 0.0)
 
-    lex.prefix = get_string_id(string[0], store)
-    lex.suffix = get_string_id(string[-3:], store)
+    lex.prefix = string_store[string[:1]]
+    lex.suffix = string_store[string[-3:]]
 
-    if upper_pc or lower_pc or title_pc:
-        canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
-        lex.norm = get_string_id(canon_cased, store)
-    else:
-        lex.norm = lex.sic
-    lex.shape = get_string_id(orth.word_shape(string), store)
-    lex.asciied = get_string_id(orth.asciied(string), store)
-    lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
+    lex.norm = lex.sic # TODO
+    lex.shape = string_store[orth.word_shape(string)]
+    lex.asciied = string_store[orth.asciied(string)]
 
-    lex.flags |= props.get('in_males', 0) << IN_MALES
-    lex.flags |= props.get('in_females', 0) << IN_FEMALES
-    lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
-    lex.flags |= props.get('in_places', 0) << IN_PLACES
-    lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
-    lex.flags |= props.get('in_games', 0) << IN_GAMES
-    lex.flags |= props.get('in_names', 0) << IN_NAMES
+    lex.flags = props.get('flags', 0)
     return lex
 
 
-cdef id_t get_string_id(unicode string, StringStore store) except 0:
-    cdef bytes byte_string = string.encode('utf8')
-    cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
-    return orig_str.i
+cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name):
+    if feat_name < (sizeof(flags_t) * 8):
+        return check_flag(lex, feat_name)
+    elif feat_name == ID:
+        return lex.id
+    elif feat_name == SIC:
+        return lex.sic
+    elif feat_name == NORM:
+        return lex.norm
+    elif feat_name == SHAPE:
+        return lex.shape
+    elif feat_name == ASCIIED:
+        return lex.asciied
+    elif feat_name == PREFIX:
+        return lex.prefix
+    elif feat_name == SUFFIX:
+        return lex.suffix
+    elif feat_name == LENGTH:
+        return lex.length
+    elif feat_name == CLUSTER:
+        return lex.cluster
+    elif feat_name == POS_TYPE:
+        return lex.pos_type
+    elif feat_name == SENSE_TYPE:
+        return lex.sense_type
+    else:
+        raise StandardError('Feature ID: %d not found' % feat_name)
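The removed get_string_id helper is replaced by StringStore's indexing syntax, `string_store[string]`, which interns the string and returns a stable integer id. A toy model of that behaviour (hypothetical; id 0 is reserved, much as the empty lexeme is in the C code):

```python
# Toy model of StringStore indexing as used in init above: indexing with a
# unicode string interns it and hands back a stable integer id.
class StringStore(object):
    def __init__(self):
        self._ids = {}
        self._strings = [u'']            # id 0 reserved

    def __getitem__(self, string):
        if string not in self._ids:
            self._ids[string] = len(self._strings)
            self._strings.append(string)
        return self._ids[string]

store = StringStore()
assert store[u'hello'] == store[u'hello'] == 1
assert store[u'world'] == 2
```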