mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Add add_flag method to Vocab, re Issue #504.
This commit is contained in:
parent
f3be9d0a9a
commit
2cc515b2ed
|
@ -21,3 +21,22 @@ def test_is_digit(en_vocab):
|
|||
assert year.flags & (1 << IS_DIGIT)
|
||||
mixed = en_vocab['hello1']
|
||||
assert not mixed.flags & (1 << IS_DIGIT)
|
||||
|
||||
|
||||
def test_add_flag_auto_id(en_vocab):
    """Check add_flag when no flag_id is given.

    The vocabulary chooses the bit itself; the new flag must be readable
    through check_flag, and the existing IS_DIGIT flag must still report
    the expected values for the same words.
    """
    is_len4 = en_vocab.add_flag(lambda string: len(string) == 4)
    # add_flag only hands out IDs from the 1..63 range.
    assert 1 <= is_len4 <= 63
    assert en_vocab['1999'].check_flag(is_len4)
    assert en_vocab['1999'].check_flag(IS_DIGIT)
    assert not en_vocab['199'].check_flag(is_len4)
    assert en_vocab['199'].check_flag(IS_DIGIT)
    assert not en_vocab['the'].check_flag(is_len4)
    assert en_vocab['dogs'].check_flag(is_len4)
|
||||
|
||||
|
||||
def test_add_flag_provided_id(en_vocab):
    """Check add_flag when an explicit flag_id is given.

    The getter is stored at the requested bit (here IS_DIGIT), so checking
    IS_DIGIT afterwards reports the "length is 4" predicate instead.
    """
    # NOTE(review): this overwrites the IS_DIGIT flag on the en_vocab fixture.
    # If that fixture is shared between tests, later IS_DIGIT assertions may
    # be affected - confirm the fixture scope.
    is_len4 = en_vocab.add_flag(lambda string: len(string) == 4, flag_id=IS_DIGIT)
    # An explicitly requested ID is returned unchanged.
    assert is_len4 == IS_DIGIT
    assert en_vocab['1999'].check_flag(is_len4)
    assert not en_vocab['199'].check_flag(is_len4)
    # '199' is all digits but not four characters long: the bit now holds
    # the new predicate, so IS_DIGIT reads as false.
    assert not en_vocab['199'].check_flag(IS_DIGIT)
    assert not en_vocab['the'].check_flag(is_len4)
    assert en_vocab['dogs'].check_flag(is_len4)
|
||||
|
|
|
@ -129,6 +129,44 @@ cdef class Vocab:
|
|||
"""The current number of lexemes stored."""
|
||||
return self.length
|
||||
|
||||
def add_flag(self, flag_getter, int flag_id=-1):
    """Set a new boolean flag on the words in the vocabulary.

    The flag_getter function is called on every word currently in the
    vocab, and is stored in lex_attr_getters under the flag's ID so that it
    can be applied to new words as they occur. You'll then be able to
    access the flag value on each token, using token.check_flag(flag_id).
    See also: Lexeme.set_flag, Lexeme.check_flag, Token.set_flag,
    Token.check_flag.

    Arguments:
        flag_getter:
            A function f(unicode) -> bool, to get the flag value.

        flag_id (int):
            An integer between 1 and 63 (inclusive), specifying the bit at
            which the flag will be stored. If -1, the lowest available bit
            will be chosen. Passing an ID that is already in use replaces
            the getter stored for that ID.

    Returns:
        flag_id (int): The integer ID by which the flag value can be checked.

    Raises:
        ValueError: If flag_id is -1 and every bit from 1 to 63 already has
            a getter, or if flag_id is given but is outside 1..63.
    """
    if flag_id == -1:
        # Auto-assign: take the lowest bit in 1..63 with no getter registered.
        for bit in range(1, 64):
            if bit not in self.lex_attr_getters:
                flag_id = bit
                break
        else:
            # The loop finished without a break, so no bit was free.
            raise ValueError(
                "Cannot find empty bit for new lexical flag. All bits between "
                "1 and 63 are occupied. You can replace one by specifying the "
                "flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA)")
    elif flag_id >= 64 or flag_id < 1:
        raise ValueError(
            "Invalid value for flag_id: %d. Flag IDs must be between "
            "1 and 63 (inclusive)" % flag_id)
    # Back-fill the flag on every lexeme already in the vocabulary.
    for lex in self:
        lex.set_flag(flag_id, flag_getter(lex.orth_))
    # Register the getter under the flag's ID.
    self.lex_attr_getters[flag_id] = flag_getter
    return flag_id
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
|
|
Loading…
Reference in New Issue
Block a user