mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
* Fix formatting and whitespace * Add support for lexical attributes (closes #2390) * Document lexical attribute setting during retokenization * Assign variable oputside of nested loop
This commit is contained in:
parent
a48deb4081
commit
62b558ab72
|
@ -289,3 +289,29 @@ def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs
|
|||
with pytest.raises(ValueError):
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[0:2], attrs=attrs)
|
||||
|
||||
|
||||
def test_doc_retokenizer_merge_lex_attrs(en_vocab):
|
||||
"""Test that retokenization also sets attributes on the lexeme if they're
|
||||
lexical attributes. For example, if a user sets IS_STOP, it should mean that
|
||||
"all tokens with that lexeme" are marked as a stop word, so the ambiguity
|
||||
here is acceptable. Also see #2390.
|
||||
"""
|
||||
# Test regular merging
|
||||
doc = Doc(en_vocab, words=["hello", "world", "!"])
|
||||
assert not any(t.is_stop for t in doc)
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[0:2], attrs={"lemma": "hello world", "is_stop": True})
|
||||
assert doc[0].lemma_ == "hello world"
|
||||
assert doc[0].is_stop
|
||||
# Test bulk merging
|
||||
doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
|
||||
assert not any(t.like_num for t in doc)
|
||||
assert not any(t.is_stop for t in doc)
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[0:2], attrs={"like_num": True})
|
||||
retokenizer.merge(doc[2:4], attrs={"is_stop": True})
|
||||
assert doc[0].like_num
|
||||
assert doc[1].is_stop
|
||||
assert not doc[0].is_stop
|
||||
assert not doc[1].like_num
|
||||
|
|
|
@ -165,3 +165,21 @@ def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs
|
|||
with doc.retokenize() as retokenizer:
|
||||
heads = [(doc[0], 1), doc[1]]
|
||||
retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
|
||||
|
||||
|
||||
def test_doc_retokenizer_split_lex_attrs(en_vocab):
|
||||
"""Test that retokenization also sets attributes on the lexeme if they're
|
||||
lexical attributes. For example, if a user sets IS_STOP, it should mean that
|
||||
"all tokens with that lexeme" are marked as a stop word, so the ambiguity
|
||||
here is acceptable. Also see #2390.
|
||||
"""
|
||||
assert not Doc(en_vocab, words=["Los"])[0].is_stop
|
||||
assert not Doc(en_vocab, words=["Angeles"])[0].is_stop
|
||||
doc = Doc(en_vocab, words=["LosAngeles", "start"])
|
||||
assert not doc[0].is_stop
|
||||
with doc.retokenize() as retokenizer:
|
||||
attrs = {"is_stop": [True, False]}
|
||||
heads = [(doc[0], 1), doc[1]]
|
||||
retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
|
||||
assert doc[0].is_stop
|
||||
assert not doc[1].is_stop
|
||||
|
|
|
@ -66,9 +66,13 @@ cdef class Retokenizer:
|
|||
for extension in extensions:
|
||||
_validate_extensions(extension)
|
||||
attrs = {key: value for key, value in attrs.items() if key != "_"}
|
||||
# NB: Since we support {"KEY": [value, value]} syntax here, this
|
||||
# will only "intify" the keys, not the values
|
||||
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
||||
attrs["_"] = extensions
|
||||
else:
|
||||
# NB: Since we support {"KEY": [value, value]} syntax here, this
|
||||
# will only "intify" the keys, not the values
|
||||
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
||||
head_offsets = []
|
||||
for head in heads:
|
||||
|
@ -153,7 +157,11 @@ def _merge(Doc doc, int start, int end, attributes):
|
|||
elif attr_name == TAG:
|
||||
doc.vocab.morphology.assign_tag(token, attr_value)
|
||||
else:
|
||||
# Set attributes on both token and lexeme to take care of token
|
||||
# attribute vs. lexical attribute without having to enumerate them.
|
||||
# If an attribute name is not valid, set_struct_attr will ignore it.
|
||||
Token.set_struct_attr(token, attr_name, attr_value)
|
||||
Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
|
||||
# Make sure ent_iob remains consistent
|
||||
if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
|
||||
if token.ent_type == doc.c[end].ent_type:
|
||||
|
@ -216,6 +224,7 @@ def _bulk_merge(Doc doc, merges):
|
|||
"""
|
||||
cdef Span span
|
||||
cdef const LexemeC* lex
|
||||
cdef TokenC* token
|
||||
cdef Pool mem = Pool()
|
||||
tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
|
||||
spans = []
|
||||
|
@ -231,15 +240,6 @@ def _bulk_merge(Doc doc, merges):
|
|||
# House the new merged token where it starts
|
||||
token = &doc.c[start]
|
||||
tokens[merge_index] = token
|
||||
# Assign attributes
|
||||
for attr_name, attr_value in attributes.items():
|
||||
if attr_name == "_": # Set extension attributes
|
||||
for ext_attr_key, ext_attr_value in attr_value.items():
|
||||
doc[start]._.set(ext_attr_key, ext_attr_value)
|
||||
elif attr_name == TAG:
|
||||
doc.vocab.morphology.assign_tag(token, attr_value)
|
||||
else:
|
||||
Token.set_struct_attr(token, attr_name, attr_value)
|
||||
# Resize the doc.tensor, if it's set. Let the last row for each token stand
|
||||
# for the merged region. To do this, we create a boolean array indicating
|
||||
# whether the row is to be deleted, then use numpy.delete
|
||||
|
@ -255,14 +255,30 @@ def _bulk_merge(Doc doc, merges):
|
|||
# We update token.lex after keeping span root and dep, since
|
||||
# setting token.lex will change span.start and span.end properties
|
||||
# as it modifies the character offsets in the doc
|
||||
for token_index in range(len(merges)):
|
||||
for token_index, (span, attributes) in enumerate(merges):
|
||||
new_orth = ''.join([t.text_with_ws for t in spans[token_index]])
|
||||
if spans[token_index][-1].whitespace_:
|
||||
new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)]
|
||||
token = tokens[token_index]
|
||||
lex = doc.vocab.get(doc.mem, new_orth)
|
||||
tokens[token_index].lex = lex
|
||||
token.lex = lex
|
||||
# We set trailing space here too
|
||||
tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy
|
||||
token.spacy = doc.c[spans[token_index].end-1].spacy
|
||||
py_token = span[0]
|
||||
# Assign attributes
|
||||
for attr_name, attr_value in attributes.items():
|
||||
if attr_name == "_": # Set extension attributes
|
||||
for ext_attr_key, ext_attr_value in attr_value.items():
|
||||
py_token._.set(ext_attr_key, ext_attr_value)
|
||||
elif attr_name == TAG:
|
||||
doc.vocab.morphology.assign_tag(token, attr_value)
|
||||
else:
|
||||
# Set attributes on both token and lexeme to take care of token
|
||||
# attribute vs. lexical attribute without having to enumerate
|
||||
# them. If an attribute name is not valid, set_struct_attr will
|
||||
# ignore it.
|
||||
Token.set_struct_attr(token, attr_name, attr_value)
|
||||
Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
|
||||
# Begin by setting all the head indices to absolute token positions
|
||||
# This is easier to work with for now than the offsets
|
||||
# Before thinking of something simpler, beware the case where a
|
||||
|
@ -281,7 +297,7 @@ def _bulk_merge(Doc doc, merges):
|
|||
current_offset = 0
|
||||
for i in range(doc.length):
|
||||
if current_span_index < len(spans) and i == spans[current_span_index].end:
|
||||
#last token was the last of the span
|
||||
# Last token was the last of the span
|
||||
current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1
|
||||
current_span_index += 1
|
||||
if current_span_index < len(spans) and \
|
||||
|
@ -405,10 +421,17 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
|||
if attr_name == "_":
|
||||
for ext_attr_key, ext_attr_value in attr_value.items():
|
||||
doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
|
||||
# NB: We need to call get_string_id here because only the keys are
|
||||
# "intified" (since we support "KEY": [value, value] syntax here).
|
||||
elif attr_name == TAG:
|
||||
doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
|
||||
else:
|
||||
# Set attributes on both token and lexeme to take care of token
|
||||
# attribute vs. lexical attribute without having to enumerate
|
||||
# them. If an attribute name is not valid, set_struct_attr will
|
||||
# ignore it.
|
||||
Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
|
||||
Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
|
||||
# Assign correct dependencies to the inner token
|
||||
for i, head in enumerate(heads):
|
||||
doc.c[token_index + i].head = head
|
||||
|
|
|
@ -402,9 +402,11 @@ invalidated, although they may accidentally continue to work.
|
|||
|
||||
### Retokenizer.merge {#retokenizer.merge tag="method"}
|
||||
|
||||
Mark a span for merging. The `attrs` will be applied to the resulting token.
|
||||
Writable custom extension attributes can be provided as a dictionary mapping
|
||||
attribute names to values as the `"_"` key.
|
||||
Mark a span for merging. The `attrs` will be applied to the resulting token (if
|
||||
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
|
||||
underlying lexeme (if they're context-independent lexical attributes like
|
||||
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
|
||||
dictionary mapping attribute names to values as the `"_"` key.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -431,7 +433,10 @@ second subtoken of `doc[3]`. This mechanism allows attaching subtokens to other
|
|||
newly created subtokens, without having to keep track of the changing token
|
||||
indices. If the specified head token will be split within the retokenizer block
|
||||
and no subtoken index is specified, it will default to `0`. Attributes to set on
|
||||
subtokens can be provided as a list of values.
|
||||
subtokens can be provided as a list of values. They'll be applied to the
|
||||
resulting token (if they're context-dependent token attributes like `LEMMA` or
|
||||
`DEP`) or to the underlying lexeme (if they're context-independent lexical
|
||||
attributes like `LOWER` or `IS_STOP`).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
|
|
@ -995,6 +995,14 @@ with doc.retokenize() as retokenizer:
|
|||
print("After:", [token.text for token in doc])
|
||||
```
|
||||
|
||||
If an attribute in the `attrs` is a context-dependent token attribute, it will
|
||||
be applied to the underlying [`Token`](/api/token). For example `LEMMA`, `POS`
|
||||
or `DEP` only apply to a word in context, so they're token attributes. If an
|
||||
attribute is a context-independent lexical attribute, it will be applied to the
|
||||
underlying [`Lexeme`](/api/lexeme), the entry in the vocabulary. For example,
|
||||
`LOWER` or `IS_STOP` apply to all words of the same spelling, regardless of the
|
||||
context.
|
||||
|
||||
<Infobox title="Tip: merging entities and noun phrases">
|
||||
|
||||
If you need to merge named entities or noun chunks, check out the built-in
|
||||
|
|
Loading…
Reference in New Issue
Block a user