Add add_flag method to Vocab, re Issue #504.

2025-11-09 04:17:53 +03:00 · 2016-10-14 12:15:38 +02:00 · 2016-10-14 12:15:38 +02:00 · 2cc515b2ed
commit 2cc515b2ed
parent f3be9d0a9a
2 changed files with 57 additions and 0 deletions
--- a/spacy/tests/vocab/test_lexeme_flags.py
+++ b/spacy/tests/vocab/test_lexeme_flags.py
@ -21,3 +21,22 @@ def test_is_digit(en_vocab):
    assert year.flags & (1 << IS_DIGIT)
    mixed = en_vocab['hello1']
    assert not mixed.flags & (1 << IS_DIGIT)
 def test_add_flag_auto_id(en_vocab):
    # No flag_id is passed, so the vocab chooses the bit and returns its id.
    len4_flag = en_vocab.add_flag(lambda text: len(text) == 4)
    # (word, flag, expected) triples, evaluated in order. The IS_DIGIT rows
    # check that the built-in flag still answers as before next to the new one.
    expectations = [
        ('1999', len4_flag, True),
        ('1999', IS_DIGIT, True),
        ('199', len4_flag, False),
        ('199', IS_DIGIT, True),
        ('the', len4_flag, False),
        ('dogs', len4_flag, True),
    ]
    for word, flag, expected in expectations:
        assert en_vocab[word].check_flag(flag) == expected
 def test_add_flag_provided_id(en_vocab):
    # The flag is registered under an explicit id (IS_DIGIT); the rows below
    # check both the returned id and IS_DIGIT itself against the new getter.
    len4_flag = en_vocab.add_flag(lambda text: len(text) == 4, flag_id=IS_DIGIT)
    # (word, flag, expected) triples, evaluated in order.
    expectations = [
        ('1999', len4_flag, True),
        ('199', len4_flag, False),
        ('199', IS_DIGIT, False),
        ('the', len4_flag, False),
        ('dogs', len4_flag, True),
    ]
    for word, flag, expected in expectations:
        assert en_vocab[word].check_flag(flag) == expected
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -129,6 +129,44 @@ cdef class Vocab:
        """The current number of lexemes stored."""
        return self.length
    def add_flag(self, flag_getter, int flag_id=-1):
        '''Set a new boolean flag to words in the vocabulary.

        The flag_getter function will be called over the words currently in
        the vocab, and then applied to new words as they occur. You'll then be
        able to access the flag value on each token, using
        token.check_flag(flag_id). See also: Lexeme.set_flag,
        Lexeme.check_flag, Token.set_flag, Token.check_flag.

        Arguments:
            flag_getter:
                A function f(unicode) -> bool, to get the flag value.
            flag_id (int):
                An integer between 1 and 63 (inclusive), specifying the bit at
                which the flag will be stored. If -1, the lowest available bit
                will be chosen. An explicitly given flag_id is not checked for
                being free: the getter registered under it is replaced.
        Returns:
            flag_id (int): The integer ID by which the flag value can be checked.
        Raises:
            ValueError: If flag_id is -1 and every bit from 1 to 63 already has
                a getter, or if an explicit flag_id is outside 1..63.
        '''
        if flag_id == -1:
            # Pick the lowest bit in 1..63 with no registered getter.
            for bit in range(1, 64):
                if bit not in self.lex_attr_getters:
                    flag_id = bit
                    break
            else:
                # The loop finished without a break: no free bit was found.
                raise ValueError(
                    "Cannot find empty bit for new lexical flag. All bits between "
                    "1 and 63 are occupied. You can replace one by specifying the "
                    "flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA)")
        elif flag_id >= 64 or flag_id < 1:
            raise ValueError(
                "Invalid value for flag_id: %d. Flag IDs must be between "
                "1 and 63 (inclusive)" % flag_id)
        # Compute the flag for every lexeme already in the vocab.
        for lex in self:
            lex.set_flag(flag_id, flag_getter(lex.orth_))
        # Register the getter under its bit.
        self.lex_attr_getters[flag_id] = flag_getter
        return flag_id
    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool.  If the pool