mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
* Fix formatting and whitespace * Add support for lexical attributes (closes #2390) * Document lexical attribute setting during retokenization * Assign variable outside of nested loop
This commit is contained in:
parent
a48deb4081
commit
62b558ab72
|
@ -289,3 +289,29 @@ def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.merge(doc[0:2], attrs=attrs)
|
retokenizer.merge(doc[0:2], attrs=attrs)
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_retokenizer_merge_lex_attrs(en_vocab):
|
||||||
|
"""Test that retokenization also sets attributes on the lexeme if they're
|
||||||
|
lexical attributes. For example, if a user sets IS_STOP, it should mean that
|
||||||
|
"all tokens with that lexeme" are marked as a stop word, so the ambiguity
|
||||||
|
here is acceptable. Also see #2390.
|
||||||
|
"""
|
||||||
|
# Test regular merging
|
||||||
|
doc = Doc(en_vocab, words=["hello", "world", "!"])
|
||||||
|
assert not any(t.is_stop for t in doc)
|
||||||
|
with doc.retokenize() as retokenizer:
|
||||||
|
retokenizer.merge(doc[0:2], attrs={"lemma": "hello world", "is_stop": True})
|
||||||
|
assert doc[0].lemma_ == "hello world"
|
||||||
|
assert doc[0].is_stop
|
||||||
|
# Test bulk merging
|
||||||
|
doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
|
||||||
|
assert not any(t.like_num for t in doc)
|
||||||
|
assert not any(t.is_stop for t in doc)
|
||||||
|
with doc.retokenize() as retokenizer:
|
||||||
|
retokenizer.merge(doc[0:2], attrs={"like_num": True})
|
||||||
|
retokenizer.merge(doc[2:4], attrs={"is_stop": True})
|
||||||
|
assert doc[0].like_num
|
||||||
|
assert doc[1].is_stop
|
||||||
|
assert not doc[0].is_stop
|
||||||
|
assert not doc[1].like_num
|
||||||
|
|
|
@ -165,3 +165,21 @@ def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
heads = [(doc[0], 1), doc[1]]
|
heads = [(doc[0], 1), doc[1]]
|
||||||
retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
|
retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_retokenizer_split_lex_attrs(en_vocab):
|
||||||
|
"""Test that retokenization also sets attributes on the lexeme if they're
|
||||||
|
lexical attributes. For example, if a user sets IS_STOP, it should mean that
|
||||||
|
"all tokens with that lexeme" are marked as a stop word, so the ambiguity
|
||||||
|
here is acceptable. Also see #2390.
|
||||||
|
"""
|
||||||
|
assert not Doc(en_vocab, words=["Los"])[0].is_stop
|
||||||
|
assert not Doc(en_vocab, words=["Angeles"])[0].is_stop
|
||||||
|
doc = Doc(en_vocab, words=["LosAngeles", "start"])
|
||||||
|
assert not doc[0].is_stop
|
||||||
|
with doc.retokenize() as retokenizer:
|
||||||
|
attrs = {"is_stop": [True, False]}
|
||||||
|
heads = [(doc[0], 1), doc[1]]
|
||||||
|
retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
|
||||||
|
assert doc[0].is_stop
|
||||||
|
assert not doc[1].is_stop
|
||||||
|
|
|
@ -66,9 +66,13 @@ cdef class Retokenizer:
|
||||||
for extension in extensions:
|
for extension in extensions:
|
||||||
_validate_extensions(extension)
|
_validate_extensions(extension)
|
||||||
attrs = {key: value for key, value in attrs.items() if key != "_"}
|
attrs = {key: value for key, value in attrs.items() if key != "_"}
|
||||||
|
# NB: Since we support {"KEY": [value, value]} syntax here, this
|
||||||
|
# will only "intify" the keys, not the values
|
||||||
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
||||||
attrs["_"] = extensions
|
attrs["_"] = extensions
|
||||||
else:
|
else:
|
||||||
|
# NB: Since we support {"KEY": [value, value]} syntax here, this
|
||||||
|
# will only "intify" the keys, not the values
|
||||||
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
||||||
head_offsets = []
|
head_offsets = []
|
||||||
for head in heads:
|
for head in heads:
|
||||||
|
@ -153,7 +157,11 @@ def _merge(Doc doc, int start, int end, attributes):
|
||||||
elif attr_name == TAG:
|
elif attr_name == TAG:
|
||||||
doc.vocab.morphology.assign_tag(token, attr_value)
|
doc.vocab.morphology.assign_tag(token, attr_value)
|
||||||
else:
|
else:
|
||||||
|
# Set attributes on both token and lexeme to take care of token
|
||||||
|
# attribute vs. lexical attribute without having to enumerate them.
|
||||||
|
# If an attribute name is not valid, set_struct_attr will ignore it.
|
||||||
Token.set_struct_attr(token, attr_name, attr_value)
|
Token.set_struct_attr(token, attr_name, attr_value)
|
||||||
|
Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
|
||||||
# Make sure ent_iob remains consistent
|
# Make sure ent_iob remains consistent
|
||||||
if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
|
if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
|
||||||
if token.ent_type == doc.c[end].ent_type:
|
if token.ent_type == doc.c[end].ent_type:
|
||||||
|
@ -216,6 +224,7 @@ def _bulk_merge(Doc doc, merges):
|
||||||
"""
|
"""
|
||||||
cdef Span span
|
cdef Span span
|
||||||
cdef const LexemeC* lex
|
cdef const LexemeC* lex
|
||||||
|
cdef TokenC* token
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
|
tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
|
||||||
spans = []
|
spans = []
|
||||||
|
@ -231,15 +240,6 @@ def _bulk_merge(Doc doc, merges):
|
||||||
# House the new merged token where it starts
|
# House the new merged token where it starts
|
||||||
token = &doc.c[start]
|
token = &doc.c[start]
|
||||||
tokens[merge_index] = token
|
tokens[merge_index] = token
|
||||||
# Assign attributes
|
|
||||||
for attr_name, attr_value in attributes.items():
|
|
||||||
if attr_name == "_": # Set extension attributes
|
|
||||||
for ext_attr_key, ext_attr_value in attr_value.items():
|
|
||||||
doc[start]._.set(ext_attr_key, ext_attr_value)
|
|
||||||
elif attr_name == TAG:
|
|
||||||
doc.vocab.morphology.assign_tag(token, attr_value)
|
|
||||||
else:
|
|
||||||
Token.set_struct_attr(token, attr_name, attr_value)
|
|
||||||
# Resize the doc.tensor, if it's set. Let the last row for each token stand
|
# Resize the doc.tensor, if it's set. Let the last row for each token stand
|
||||||
# for the merged region. To do this, we create a boolean array indicating
|
# for the merged region. To do this, we create a boolean array indicating
|
||||||
# whether the row is to be deleted, then use numpy.delete
|
# whether the row is to be deleted, then use numpy.delete
|
||||||
|
@ -255,14 +255,30 @@ def _bulk_merge(Doc doc, merges):
|
||||||
# We update token.lex after keeping span root and dep, since
|
# We update token.lex after keeping span root and dep, since
|
||||||
# setting token.lex will change span.start and span.end properties
|
# setting token.lex will change span.start and span.end properties
|
||||||
# as it modifies the character offsets in the doc
|
# as it modifies the character offsets in the doc
|
||||||
for token_index in range(len(merges)):
|
for token_index, (span, attributes) in enumerate(merges):
|
||||||
new_orth = ''.join([t.text_with_ws for t in spans[token_index]])
|
new_orth = ''.join([t.text_with_ws for t in spans[token_index]])
|
||||||
if spans[token_index][-1].whitespace_:
|
if spans[token_index][-1].whitespace_:
|
||||||
new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)]
|
new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)]
|
||||||
|
token = tokens[token_index]
|
||||||
lex = doc.vocab.get(doc.mem, new_orth)
|
lex = doc.vocab.get(doc.mem, new_orth)
|
||||||
tokens[token_index].lex = lex
|
token.lex = lex
|
||||||
# We set trailing space here too
|
# We set trailing space here too
|
||||||
tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy
|
token.spacy = doc.c[spans[token_index].end-1].spacy
|
||||||
|
py_token = span[0]
|
||||||
|
# Assign attributes
|
||||||
|
for attr_name, attr_value in attributes.items():
|
||||||
|
if attr_name == "_": # Set extension attributes
|
||||||
|
for ext_attr_key, ext_attr_value in attr_value.items():
|
||||||
|
py_token._.set(ext_attr_key, ext_attr_value)
|
||||||
|
elif attr_name == TAG:
|
||||||
|
doc.vocab.morphology.assign_tag(token, attr_value)
|
||||||
|
else:
|
||||||
|
# Set attributes on both token and lexeme to take care of token
|
||||||
|
# attribute vs. lexical attribute without having to enumerate
|
||||||
|
# them. If an attribute name is not valid, set_struct_attr will
|
||||||
|
# ignore it.
|
||||||
|
Token.set_struct_attr(token, attr_name, attr_value)
|
||||||
|
Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
|
||||||
# Begin by setting all the head indices to absolute token positions
|
# Begin by setting all the head indices to absolute token positions
|
||||||
# This is easier to work with for now than the offsets
|
# This is easier to work with for now than the offsets
|
||||||
# Before thinking of something simpler, beware the case where a
|
# Before thinking of something simpler, beware the case where a
|
||||||
|
@ -281,7 +297,7 @@ def _bulk_merge(Doc doc, merges):
|
||||||
current_offset = 0
|
current_offset = 0
|
||||||
for i in range(doc.length):
|
for i in range(doc.length):
|
||||||
if current_span_index < len(spans) and i == spans[current_span_index].end:
|
if current_span_index < len(spans) and i == spans[current_span_index].end:
|
||||||
#last token was the last of the span
|
# Last token was the last of the span
|
||||||
current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1
|
current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1
|
||||||
current_span_index += 1
|
current_span_index += 1
|
||||||
if current_span_index < len(spans) and \
|
if current_span_index < len(spans) and \
|
||||||
|
@ -405,10 +421,17 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
||||||
if attr_name == "_":
|
if attr_name == "_":
|
||||||
for ext_attr_key, ext_attr_value in attr_value.items():
|
for ext_attr_key, ext_attr_value in attr_value.items():
|
||||||
doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
|
doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
|
||||||
|
# NB: We need to call get_string_id here because only the keys are
|
||||||
|
# "intified" (since we support "KEY": [value, value] syntax here).
|
||||||
elif attr_name == TAG:
|
elif attr_name == TAG:
|
||||||
doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
|
doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
|
||||||
else:
|
else:
|
||||||
|
# Set attributes on both token and lexeme to take care of token
|
||||||
|
# attribute vs. lexical attribute without having to enumerate
|
||||||
|
# them. If an attribute name is not valid, set_struct_attr will
|
||||||
|
# ignore it.
|
||||||
Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
|
Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
|
||||||
|
Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
|
||||||
# Assign correct dependencies to the inner token
|
# Assign correct dependencies to the inner token
|
||||||
for i, head in enumerate(heads):
|
for i, head in enumerate(heads):
|
||||||
doc.c[token_index + i].head = head
|
doc.c[token_index + i].head = head
|
||||||
|
|
|
@ -402,9 +402,11 @@ invalidated, although they may accidentally continue to work.
|
||||||
|
|
||||||
### Retokenizer.merge {#retokenizer.merge tag="method"}
|
### Retokenizer.merge {#retokenizer.merge tag="method"}
|
||||||
|
|
||||||
Mark a span for merging. The `attrs` will be applied to the resulting token.
|
Mark a span for merging. The `attrs` will be applied to the resulting token (if
|
||||||
Writable custom extension attributes can be provided as a dictionary mapping
|
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
|
||||||
attribute names to values as the `"_"` key.
|
underlying lexeme (if they're context-independent lexical attributes like
|
||||||
|
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
|
||||||
|
dictionary mapping attribute names to values as the `"_"` key.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -431,7 +433,10 @@ second subtoken of `doc[3]`. This mechanism allows attaching subtokens to other
|
||||||
newly created subtokens, without having to keep track of the changing token
|
newly created subtokens, without having to keep track of the changing token
|
||||||
indices. If the specified head token will be split within the retokenizer block
|
indices. If the specified head token will be split within the retokenizer block
|
||||||
and no subtoken index is specified, it will default to `0`. Attributes to set on
|
and no subtoken index is specified, it will default to `0`. Attributes to set on
|
||||||
subtokens can be provided as a list of values.
|
subtokens can be provided as a list of values. They'll be applied to the
|
||||||
|
resulting token (if they're context-dependent token attributes like `LEMMA` or
|
||||||
|
`DEP`) or to the underlying lexeme (if they're context-independent lexical
|
||||||
|
attributes like `LOWER` or `IS_STOP`).
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
|
|
@ -995,6 +995,14 @@ with doc.retokenize() as retokenizer:
|
||||||
print("After:", [token.text for token in doc])
|
print("After:", [token.text for token in doc])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If an attribute in the `attrs` is a context-dependent token attribute, it will
|
||||||
|
be applied to the underlying [`Token`](/api/token). For example, `LEMMA`, `POS`
|
||||||
|
or `DEP` only apply to a word in context, so they're token attributes. If an
|
||||||
|
attribute is a context-independent lexical attribute, it will be applied to the
|
||||||
|
underlying [`Lexeme`](/api/lexeme), the entry in the vocabulary. For example,
|
||||||
|
`LOWER` or `IS_STOP` apply to all words of the same spelling, regardless of the
|
||||||
|
context.
|
||||||
|
|
||||||
<Infobox title="Tip: merging entities and noun phrases">
|
<Infobox title="Tip: merging entities and noun phrases">
|
||||||
|
|
||||||
If you need to merge named entities or noun chunks, check out the built-in
|
If you need to merge named entities or noun chunks, check out the built-in
|
||||||
|
|
Loading…
Reference in New Issue
Block a user