* Make flag-setting a language-specific thing

Matthew Honnibal 2014-12-03 11:04:00 +11:00
parent 71b009e323
commit b463a7eb86
6 changed files with 224 additions and 146 deletions

View File

@@ -1,6 +1,32 @@
from spacy.lang cimport Language
from spacy.tokens cimport Tokens
# Flags
cpdef enum FlagID:
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUMBER
OFT_LOWER
OFT_TITLE
OFT_UPPER
IN_MALES
IN_FEMALES
IN_SURNAMES
IN_PLACES
IN_GAMES
IN_CELEBS
IN_NAMES
cdef class English(Language):
pass
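
A note on the enum: a cpdef enum without explicit values numbers its members consecutively from zero, so IS_ALPHA is 0, IS_ASCII is 1, and so on through IN_NAMES. Each ID doubles as a bit position in the lexeme's 64-bit flags value. A pure-Python sketch of that correspondence (IntEnum and the explicit values are illustrative; only the first few names are shown):

from enum import IntEnum

class FlagID(IntEnum):
    # Mirrors the cpdef enum: members are numbered consecutively from 0.
    IS_ALPHA = 0
    IS_ASCII = 1
    IS_DIGIT = 2
    IS_LOWER = 3
    IS_PUNCT = 4
    IS_SPACE = 5
    IS_TITLE = 6
    IS_UPPER = 7
    LIKE_URL = 8
    LIKE_NUMBER = 9

# Each ID is a bit position in a single 64-bit flags value.
flags = (1 << FlagID.IS_ALPHA) | (1 << FlagID.IS_LOWER)
assert flags & (1 << FlagID.IS_LOWER)
assert not flags & (1 << FlagID.IS_UPPER)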

View File

@@ -38,6 +38,8 @@ provides a fully Penn Treebank 3-compliant tokenizer.
from __future__ import unicode_literals
cimport lang
from .typedefs cimport flags_t
import orth
cdef class English(Language):
@@ -47,7 +49,20 @@ cdef class English(Language):
name (unicode): The two letter code used by Wikipedia for the language.
lexicon (Lexicon): The lexicon. Exposes the lookup method.
"""
pass
def set_flags(self, unicode string):
cdef flags_t flags = 0
flags |= orth.is_alpha(string) << IS_ALPHA
flags |= orth.is_ascii(string) << IS_ASCII
flags |= orth.is_digit(string) << IS_DIGIT
flags |= orth.is_lower(string) << IS_LOWER
flags |= orth.is_punct(string) << IS_PUNCT
flags |= orth.is_space(string) << IS_SPACE
flags |= orth.is_title(string) << IS_TITLE
flags |= orth.is_upper(string) << IS_UPPER
flags |= orth.like_url(string) << LIKE_URL
flags |= orth.like_number(string) << LIKE_NUMBER
return flags
EN = English('en')
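
set_flags is the heart of the commit: each Language subclass decides which orthographic predicates populate which flag bits, and the Lexicon (below) calls back into it whenever it builds a new Lexeme. A hedged pure-Python sketch of the same packing pattern, with str methods standing in for the orth predicates (which are not part of this diff):

IS_ALPHA, IS_DIGIT, IS_LOWER, LIKE_URL = 0, 1, 2, 3

def set_flags(string):
    # Pack one boolean predicate per bit, as English.set_flags does.
    flags = 0
    flags |= int(string.isalpha()) << IS_ALPHA
    flags |= int(string.isdigit()) << IS_DIGIT
    flags |= int(string.islower()) << IS_LOWER
    # Crude stand-in for orth.like_url:
    flags |= int(string.startswith(('http://', 'www.'))) << LIKE_URL
    return flags

assert set_flags('hello') & (1 << IS_LOWER)
assert set_flags('www.example.com') & (1 << LIKE_URL)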

View File

@@ -8,23 +8,17 @@ from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .tokens cimport Tokens
from .lexeme cimport Lexeme
from .tagger cimport Tagger
from .utf8string cimport StringStore
cdef struct String:
Py_UNICODE* chars
size_t n
hash_t key
from .utf8string cimport StringStore, UniStr
cdef class Lexicon:
cpdef public set_flags
cdef Pool mem
cpdef readonly size_t size
cpdef readonly StringStore strings
cdef vector[Lexeme*] lexemes
cdef Lexeme* get(self, String* s) except NULL
cdef Lexeme* get(self, UniStr* s) except NULL
cdef PreshMap _map
@@ -43,10 +37,10 @@ cdef class Language:
cpdef Tokens tokens_from_list(self, list strings)
cpdef Tokens tokenize(self, unicode text)
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1

View File

@@ -19,6 +19,8 @@ from .lexeme cimport Lexeme
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init
from .utf8string cimport slice_unicode
from . import util
from .util import read_lang_data
from .tokens import Tokens
@@ -34,7 +36,7 @@ cdef class Language:
self._prefix_re = re.compile(prefix)
self._suffix_re = re.compile(suffix)
self._infix_re = re.compile(infix)
self.lexicon = Lexicon()
self.lexicon = Lexicon(self.set_flags)
if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
@@ -45,11 +47,11 @@ cdef class Language:
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
if length == 0:
return tokens
cdef String string_struct
cdef UniStr string_struct
cdef unicode py_string
cdef int idx = 0
for i, py_string in enumerate(strings):
string_from_unicode(&string_struct, py_string)
slice_unicode(&string_struct, py_string, 0, len(py_string))
tokens.push_back(idx, self.lexicon.get(&string_struct))
idx += len(py_string) + 1
return tokens
@@ -77,11 +79,11 @@ cdef class Language:
cdef int start = 0
cdef Py_UNICODE* chars = string
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
cdef String span
cdef UniStr span
for i in range(1, length):
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
if start < i:
string_slice(&span, chars, start, i)
slice_unicode(&span, chars, start, i)
lexemes = <Lexeme**>self._cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
@@ -93,7 +95,7 @@ cdef class Language:
start += 1
i += 1
if start < i:
string_slice(&span, chars, start, i)
slice_unicode(&span, chars, start, i)
lexemes = <Lexeme**>self._cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
@@ -101,7 +103,7 @@ cdef class Language:
self._tokenize(tokens, &span, start, i)
return tokens
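
The loop above is the first pass of tokenization: scan for whitespace boundaries, look each non-space span up in the cache by its hash, and only fall through to _tokenize on a miss. A simplified Python sketch of that control flow, with a dict for the cache and plain strings for lexemes (the real keys are 64-bit span hashes):

def tokenize(text, cache, split_span):
    # split_span: fallback that splits an uncached span into sub-tokens.
    tokens = []
    for span in text.split():              # whitespace pass
        hit = cache.get(span)
        if hit is not None:
            tokens.extend(hit)             # cached: reuse the token sequence
        else:
            subtokens = split_span(span)   # miss: run the affix machinery
            cache[span] = subtokens        # memoize for next time
            tokens.extend(subtokens)
    return tokens

print(tokenize("Don't stop", {}, lambda s: [s]))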
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes
cdef hash_t orig_key
@@ -112,20 +114,20 @@ cdef class Language:
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL:
cdef size_t i
cdef String prefix
cdef String suffix
cdef String minus_pre
cdef String minus_suf
cdef UniStr prefix
cdef UniStr suffix
cdef UniStr minus_pre
cdef UniStr minus_suf
cdef size_t last_size = 0
while string.n != 0 and string.n != last_size:
last_size = string.n
pre_len = self._find_prefix(string.chars, string.n)
if pre_len != 0:
string_slice(&prefix, string.chars, 0, pre_len)
string_slice(&minus_pre, string.chars, pre_len, string.n)
slice_unicode(&prefix, string.chars, 0, pre_len)
slice_unicode(&minus_pre, string.chars, pre_len, string.n)
# Check whether we've hit a special-case
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
string[0] = minus_pre
@@ -133,15 +135,15 @@ cdef class Language:
break
suf_len = self._find_suffix(string.chars, string.n)
if suf_len != 0:
string_slice(&suffix, string.chars, string.n - suf_len, string.n)
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
# Check whether we've hit a special-case
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
string[0] = minus_suf
suffixes.push_back(self.lexicon.get(&suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
string_slice(string, string.chars, pre_len, string.n - suf_len)
slice_unicode(string, string.chars, pre_len, string.n - suf_len)
prefixes.push_back(self.lexicon.get(&prefix))
suffixes.push_back(self.lexicon.get(&suffix))
elif pre_len:
@@ -155,13 +157,13 @@ cdef class Language:
return string
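
_split_affixes peels prefixes and suffixes off the span until nothing more matches or a special-case string is exposed, collecting the stripped pieces for _attach_tokens. A rough Python rendering under the same assumptions (find_prefix/find_suffix return the matched length or 0, and specials is a set of strings with dedicated rules):

def split_affixes(string, find_prefix, find_suffix, specials):
    prefixes, suffixes = [], []
    last_size = -1
    while string and len(string) != last_size:
        last_size = len(string)
        pre_len = find_prefix(string)
        if pre_len:
            prefix, minus_pre = string[:pre_len], string[pre_len:]
            if minus_pre in specials:       # special case: stop splitting
                prefixes.append(prefix)
                return prefixes, minus_pre, suffixes
        suf_len = find_suffix(string)
        if suf_len:
            suffix = string[len(string) - suf_len:]
            minus_suf = string[:len(string) - suf_len]
            if minus_suf in specials:
                suffixes.append(suffix)
                return prefixes, minus_suf, suffixes
        if pre_len and suf_len and pre_len + suf_len <= len(string):
            string = string[pre_len:len(string) - suf_len]
            prefixes.append(prefix)
            suffixes.append(suffix)
        elif pre_len:
            string = minus_pre
            prefixes.append(prefix)
        elif suf_len:
            string = minus_suf
            suffixes.append(suffix)
    return prefixes, string, suffixes

find_prefix = lambda s: 1 if s[:1] in '("' else 0
find_suffix = lambda s: 1 if s[-1:] in '.,)?!"' else 0
print(split_affixes('"(hello)."', find_prefix, find_suffix, set()))
# (['"', '('], 'hello', ['"', '.', ')'])

Note that suffixes come out outermost-first; the real code attaches them with a reverse iterator (rbegin/rend) to restore surface order.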
cdef int _attach_tokens(self, Tokens tokens,
int idx, String* string,
int idx, UniStr* string,
vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except -1:
cdef int split
cdef Lexeme** lexemes
cdef Lexeme* lexeme
cdef String span
cdef UniStr span
if prefixes.size():
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
if string.n != 0:
@@ -174,11 +176,11 @@ cdef class Language:
if split == 0 or split == -1:
idx = tokens.push_back(idx, self.lexicon.get(string))
else:
string_slice(&span, string.chars, 0, split)
slice_unicode(&span, string.chars, 0, split)
idx = tokens.push_back(idx, self.lexicon.get(&span))
string_slice(&span, string.chars, split, split+1)
slice_unicode(&span, string.chars, split, split+1)
idx = tokens.push_back(idx, self.lexicon.get(&span))
string_slice(&span, string.chars, split + 1, string.n)
slice_unicode(&span, string.chars, split + 1, string.n)
idx = tokens.push_back(idx, self.lexicon.get(&span))
cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
@@ -222,14 +224,14 @@ cdef class Language:
'''
cdef Lexeme** lexemes
cdef hash_t hashed
cdef String string
cdef UniStr string
for uni_string, substrings in token_rules:
lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
for i, substring in enumerate(substrings):
string_from_unicode(&string, substring)
slice_unicode(&string, substring, 0, len(substring))
lexemes[i] = <Lexeme*>self.lexicon.get(&string)
lexemes[i + 1] = NULL
string_from_unicode(&string, uni_string)
slice_unicode(&string, uni_string, 0, len(uni_string))
self._specials.set(string.key, lexemes)
self._cache.set(string.key, lexemes)
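
This is how special cases (contractions and the like) get their pre-computed analyses: each rule's token sequence is built once, then stored in both the specials table and the cache under the string's hash. In Python terms, with plain strings for lexemes:

def load_specials(token_rules):
    specials, cache = {}, {}
    for string, substrings in token_rules:
        lexemes = list(substrings)    # the real code interns Lexeme* here
        specials[string] = lexemes
        cache[string] = lexemes       # the cache shares the same entry
    return specials, cache

specials, cache = load_specials([("don't", ["do", "n't"])])
assert cache["don't"] == ["do", "n't"]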
@@ -239,21 +241,23 @@ cdef class Lexicon:
Also interns UTF-8 strings, and maps them to consecutive integer IDs.
'''
def __init__(self):
def __init__(self, object set_flags=None):
self.mem = Pool()
self._map = PreshMap(2 ** 20)
self.strings = StringStore()
self.lexemes.push_back(&EMPTY_LEXEME)
self.size = 1
self.set_flags = set_flags
cdef Lexeme* get(self, String* string) except NULL:
cdef Lexeme* get(self, UniStr* string) except NULL:
'''Retrieve a pointer to a Lexeme from the lexicon.'''
cdef Lexeme* lex
lex = <Lexeme*>self._map.get(string.key)
if lex != NULL:
return lex
lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
self.strings, {'flags': self.set_flags(string.chars[:string.n])})
self._map.set(string.key, lex)
while self.lexemes.size() < (lex.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME)
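
Lexicon.get is a memoized constructor: look the span's hash up in the map, and only on a miss allocate and initialize a fresh Lexeme, now passing in the flags computed by the language-supplied set_flags callback. A dict-based sketch of the pattern:

class Lexicon:
    def __init__(self, set_flags=None):
        self._map = {}                    # PreshMap in the real code
        self.set_flags = set_flags or (lambda s: 0)
        self.size = 1                     # slot 0 is the empty lexeme

    def get(self, string):
        lex = self._map.get(string)
        if lex is not None:
            return lex                    # hit: one lexeme per string, ever
        lex = {'id': self.size, 'flags': self.set_flags(string)}
        self._map[string] = lex
        self.size += 1
        return lex

lexicon = Lexicon(set_flags=lambda s: int(s.isalpha()))
assert lexicon.get('dog') is lexicon.get('dog')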
@@ -283,14 +287,14 @@ cdef class Lexicon:
'''
if type(id_or_string) == int:
return self.lexemes.at(id_or_string)[0]
cdef String string
string_from_unicode(&string, id_or_string)
cdef UniStr string
slice_unicode(&string, id_or_string, 0, len(id_or_string))
cdef Lexeme* lexeme = self.get(&string)
return lexeme[0]
def __setitem__(self, unicode uni_string, dict props):
cdef String s
string_from_unicode(&s, uni_string)
cdef UniStr s
slice_unicode(&s, uni_string, 0, len(uni_string))
cdef Lexeme* lex = self.get(&s)
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
@@ -338,14 +342,3 @@ cdef class Lexicon:
i += 1
self.size += 1
fclose(fp)
cdef void string_from_unicode(String* s, unicode uni):
cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
string_slice(s, c_uni, 0, len(uni))
cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
s.chars = &chars[start]
s.n = end - start
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
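
These two deleted helpers are what slice_unicode (cimported from utf8string) replaces: record a span's start pointer, its length, and a 64-bit hash of its characters, so spans can be cached and compared without copying. A Python approximation of the triple it fills in (hashlib stands in for hash64, which hashes the raw Py_UNICODE bytes in place):

import hashlib

def slice_unicode(chars, start, end):
    # Returns the (chars, n, key) triple that the UniStr struct carries.
    span = chars[start:end]
    digest = hashlib.blake2b(span.encode('utf-8'), digest_size=8).digest()
    return {'chars': span, 'n': end - start,
            'key': int.from_bytes(digest, 'little')}

s = slice_unicode('tokenizer', 0, 5)
assert s['chars'] == 'token' and s['n'] == 5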

View File

@@ -1,61 +1,119 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
from .utf8string cimport StringStore
from libc.stdint cimport uint16_t
cpdef flag_t OOV_DIST_FLAGS
# Flags
cpdef enum:
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
# Reserve 64 values for flag features
cpdef enum attr_id_t:
FLAG0
FLAG1
FLAG2
FLAG3
FLAG4
FLAG5
FLAG6
FLAG7
FLAG8
FLAG9
FLAG10
FLAG11
FLAG12
FLAG13
FLAG14
FLAG15
FLAG16
FLAG17
FLAG18
FLAG19
FLAG20
FLAG21
FLAG22
FLAG23
FLAG24
FLAG25
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
LIKE_URL
LIKE_NUMBER
ID
SIC
NORM
SHAPE
ASCIIED
PREFIX
SUFFIX
OFT_LOWER
OFT_TITLE
OFT_UPPER
IN_MALES
IN_FEMALES
IN_SURNAMES
IN_PLACES
IN_GAMES
IN_CELEBS
IN_NAMES
LENGTH
CLUSTER
POS_TYPE
SENSE_TYPE
cdef struct Lexeme:
flag_t flags
flags_t flags
id_t id
id_t sic
id_t norm
id_t shape
id_t asciied
id_t prefix
id_t suffix
attr_t id
attr_t sic
attr_t norm
attr_t shape
attr_t asciied
attr_t prefix
attr_t suffix
attr_t length
attr_t cluster
attr_t pos_type
attr_t sense_type
float prob
len_t length
tag_t cluster
tag_t postype
tag_t supersense
float upper_pc
float title_pc
cdef Lexeme EMPTY_LEXEME
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
StringStore store, dict props) except *
cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
dict props) except *
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id)
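
Reserving FLAG0-FLAG63 at the front of attr_id_t is what keeps the attribute interface uniform: by construction, any attribute ID below 64 is a bit position in the flags_t field, so check_flag can answer it with one shift and mask, and callers never need to know whether a feature is a packed boolean or a stored integer. A small sketch of the convention (the non-flag values are illustrative):

NUM_FLAGS = 64       # FLAG0..FLAG63 occupy values 0-63, one per bit
ID, LENGTH = 66, 80  # illustrative: the real enum continues past the flags

def is_flag(attr_id):
    return attr_id < NUM_FLAGS

assert is_flag(13)           # FLAG13 is a bit test on lex.flags
assert not is_flag(LENGTH)   # LENGTH reads its own struct field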

View File

@@ -6,67 +6,59 @@ from libc.string cimport memset
import orth
from .utf8string cimport Utf8Str
OOV_DIST_FLAGS = 0
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
cdef flag_t flags = 0
flags |= orth.is_alpha(string) << IS_ALPHA
flags |= orth.is_ascii(string) << IS_ASCII
flags |= orth.is_digit(string) << IS_DIGIT
flags |= orth.is_lower(string) << IS_LOWER
flags |= orth.is_punct(string) << IS_PUNCT
flags |= orth.is_space(string) << IS_SPACE
flags |= orth.is_title(string) << IS_TITLE
flags |= orth.is_upper(string) << IS_UPPER
flags |= orth.like_url(string) << LIKE_URL
flags |= orth.like_number(string) << LIKE_NUMBER
return flags
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
StringStore store, dict props) except *:
StringStore string_store, dict props) except *:
cdef Lexeme lex
lex.id = i
lex.length = len(string)
lex.sic = get_string_id(string, store)
lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0)
lex.postype = props.get('postype', 0)
lex.supersense = props.get('supersense', 0)
lex.pos_type = props.get('pos_type', 0)
lex.sense_type = props.get('sense_type', 0)
lex.prob = props.get('prob', 0)
cdef float upper_pc = props.get('upper_pc', 0.0)
cdef float lower_pc = props.get('lower_pc', 0.0)
cdef float title_pc = props.get('title_pc', 0.0)
lex.upper_pc = props.get('upper_pc', 0.0)
lex.title_pc = props.get('lower_pc', 0.0)
lex.prefix = get_string_id(string[0], store)
lex.suffix = get_string_id(string[-3:], store)
if upper_pc or lower_pc or title_pc:
canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
lex.norm = get_string_id(canon_cased, store)
else:
lex.norm = lex.sic
lex.shape = get_string_id(orth.word_shape(string), store)
lex.asciied = get_string_id(orth.asciied(string), store)
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
lex.flags |= props.get('in_males', 0) << IN_MALES
lex.flags |= props.get('in_females', 0) << IN_FEMALES
lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
lex.flags |= props.get('in_places', 0) << IN_PLACES
lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
lex.flags |= props.get('in_games', 0) << IN_GAMES
lex.flags |= props.get('in_names', 0) << IN_NAMES
lex.prefix = string_store[string[:1]]
lex.suffix = string_store[string[-3:]]
lex.norm = lex.sic # TODO
lex.shape = string_store[orth.word_shape(string)]
lex.asciied = string_store[orth.asciied(string)]
lex.flags = props.get('flags', 0)
return lex
cdef id_t get_string_id(unicode string, StringStore store) except 0:
cdef bytes byte_string = string.encode('utf8')
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
return orig_str.i
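
The deleted get_string_id helper is subsumed by StringStore indexing: string_store[string] interns the string and hands back its integer ID, which is what init now uses for sic, prefix, suffix, shape, and asciied. A minimal Python model of that interning behavior (starting IDs at 1 to keep 0 as a sentinel is an assumption, mirroring the EMPTY_LEXEME slot):

class StringStore:
    # Minimal model: interns strings, handing out consecutive integer IDs.
    def __init__(self):
        self._ids = {}

    def __getitem__(self, string):
        if string not in self._ids:
            self._ids[string] = len(self._ids) + 1
        return self._ids[string]

store = StringStore()
assert store['dog'] == store['dog']    # same string, same ID
assert store['dog'] != store['cat']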
cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name):
if feat_name < (sizeof(flags_t) * 8):
return check_flag(lex, feat_name)
elif feat_name == ID:
return lex.id
elif feat_name == SIC:
return lex.sic
elif feat_name == NORM:
return lex.norm
elif feat_name == SHAPE:
return lex.shape
elif feat_name == ASCIIED:
return lex.asciied
elif feat_name == PREFIX:
return lex.prefix
elif feat_name == SUFFIX:
return lex.suffix
elif feat_name == LENGTH:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
elif feat_name == POS_TYPE:
return lex.pos_type
elif feat_name == SENSE_TYPE:
return lex.sense_type
else:
raise StandardError('Feature ID: %d not found' % feat_name)
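
get_attr is the dispatcher that the reserved flag range makes possible: any feature ID below sizeof(flags_t) * 8 = 64 is answered by check_flag's bit test, and everything else reads a struct field. A condensed Python sketch, with the lexeme as a dict and only a few attributes mapped (the IDs are illustrative):

NUM_FLAG_BITS = 64            # sizeof(flags_t) * 8 in the Cython version
ID, SIC, LENGTH = 66, 67, 80  # illustrative IDs beyond the flag range

def get_attr(lex, attr_id):
    if attr_id < NUM_FLAG_BITS:              # flag: answer with a bit test
        return (lex['flags'] >> attr_id) & 1
    fields = {ID: 'id', SIC: 'sic', LENGTH: 'length'}
    if attr_id not in fields:
        raise ValueError('Feature ID: %d not found' % attr_id)
    return lex[fields[attr_id]]              # attribute: read a struct field

lex = {'flags': 1 << 3, 'id': 7, 'sic': 42, 'length': 5}
assert get_attr(lex, 3) == 1                 # FLAG3 is set
assert get_attr(lex, LENGTH) == 5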