mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
Add add_flag method to Vocab, re Issue #504.
This commit is contained in:
parent
f3be9d0a9a
commit
2cc515b2ed
|
@ -21,3 +21,22 @@ def test_is_digit(en_vocab):
|
||||||
assert year.flags & (1 << IS_DIGIT)
|
assert year.flags & (1 << IS_DIGIT)
|
||||||
mixed = en_vocab['hello1']
|
mixed = en_vocab['hello1']
|
||||||
assert not mixed.flags & (1 << IS_DIGIT)
|
assert not mixed.flags & (1 << IS_DIGIT)
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_flag_auto_id(en_vocab):
    """A flag added without an explicit ID must not disturb built-in flags."""
    # No flag_id given: the vocab chooses the bit and returns its ID.
    is_len4 = en_vocab.add_flag(lambda string: len(string) == 4)
    # (word, flag, expected value) triples, checked in this order.
    expectations = [
        ('1999', is_len4, True),
        ('1999', IS_DIGIT, True),
        ('199', is_len4, False),
        ('199', IS_DIGIT, True),
        ('the', is_len4, False),
        ('dogs', is_len4, True),
    ]
    for word, flag, expected in expectations:
        assert en_vocab[word].check_flag(flag) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_flag_provided_id(en_vocab):
    """A flag added with an explicit ID takes over that bit."""
    # Reuse the IS_DIGIT bit: from now on it stores the "length is 4" value.
    is_len4 = en_vocab.add_flag(lambda string: len(string) == 4, flag_id=IS_DIGIT)
    # (word, flag, expected value) triples, checked in this order.
    expectations = [
        ('1999', is_len4, True),
        ('199', is_len4, False),
        # '199' is all digits, but the bit no longer holds the digit test.
        ('199', IS_DIGIT, False),
        ('the', is_len4, False),
        ('dogs', is_len4, True),
    ]
    for word, flag, expected in expectations:
        assert en_vocab[word].check_flag(flag) == expected
|
||||||
|
|
|
@ -129,6 +129,44 @@ cdef class Vocab:
|
||||||
"""The current number of lexemes stored."""
|
"""The current number of lexemes stored."""
|
||||||
return self.length
|
return self.length
|
||||||
|
|
||||||
|
def add_flag(self, flag_getter, int flag_id=-1):
    """Set a new boolean flag on the words in the vocabulary.

    The flag_getter function is called over the words currently in the
    vocab, and is stored in self.lex_attr_getters under flag_id so it can be
    applied to new words as they occur. The flag value can then be read
    with check_flag(flag_id). See also: Lexeme.set_flag, Lexeme.check_flag,
    Token.set_flag, Token.check_flag.

    Arguments:
        flag_getter:
            A function f(unicode) -> bool, to get the flag value.

        flag_id (int):
            An integer between 1 and 63 (inclusive), specifying the bit at
            which the flag will be stored. If -1, the lowest available bit
            will be chosen. Passing an ID that is already in use replaces
            the existing getter for that bit.

    Returns:
        flag_id (int): The integer ID by which the flag value can be checked.

    Raises:
        ValueError: If flag_id is -1 and every bit from 1 to 63 is taken, or
            if an explicit flag_id lies outside 1..63.
    """
    if flag_id == -1:
        # Pick the lowest bit that has no getter registered yet.
        for bit in range(1, 64):
            if bit not in self.lex_attr_getters:
                flag_id = bit
                break
        else:
            # Loop finished without `break`: no free bit was found.
            raise ValueError(
                "Cannot find empty bit for new lexical flag. All bits between "
                "1 and 63 are occupied. You can replace one by specifying the "
                "flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA)")
    elif flag_id >= 64 or flag_id < 1:
        raise ValueError(
            "Invalid value for flag_id: %d. Flag IDs must be between "
            "1 and 63 (inclusive)" % flag_id)
    # Back-fill the flag on every lexeme already stored in the vocab.
    for lex in self:
        lex.set_flag(flag_id, flag_getter(lex.orth_))
    # Register the getter under the chosen bit (replaces any previous one).
    self.lex_attr_getters[flag_id] = flag_getter
    return flag_id
|
||||||
|
|
||||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||||
if necessary, using memory acquired from the given pool. If the pool
|
if necessary, using memory acquired from the given pool. If the pool
|
||||||
|
|
Loading…
Reference in New Issue
Block a user