Mirror of https://github.com/explosion/spaCy.git (last synced 2025-01-26 09:14:32 +03:00).
Rename tags to pos in get_doc and allow adding tags to tokens
This commit is contained in:
parent
1add8ace67
commit
a6790b6694
|
@ -19,9 +19,9 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab):
|
||||||
|
|
||||||
def test_doc_array_tag(en_tokenizer):
|
def test_doc_array_tag(en_tokenizer):
|
||||||
text = "A nice sentence."
|
text = "A nice sentence."
|
||||||
tags = ['DET', 'ADJ', 'NOUN', 'PUNCT']
|
pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags)
|
doc = get_doc(tokens.vocab, [t.text for t in tokens], pos=pos)
|
||||||
assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos
|
assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos
|
||||||
feats_array = doc.to_array((ORTH, POS))
|
feats_array = doc.to_array((ORTH, POS))
|
||||||
assert feats_array[0][1] == doc[0].pos
|
assert feats_array[0][1] == doc[0].pos
|
||||||
|
|
|
@ -10,12 +10,12 @@ import numpy
|
||||||
|
|
||||||
def test_doc_token_api_strings(en_tokenizer):
|
def test_doc_token_api_strings(en_tokenizer):
|
||||||
text = "Give it back! He pleaded."
|
text = "Give it back! He pleaded."
|
||||||
tags = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT']
|
pos = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT']
|
||||||
heads = [0, -1, -2, -3, 1, 0, -1]
|
heads = [0, -1, -2, -3, 1, 0, -1]
|
||||||
deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct']
|
deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct']
|
||||||
|
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags, heads, deps)
|
doc = get_doc(tokens.vocab, [t.text for t in tokens], pos=pos, heads=heads, deps=deps)
|
||||||
assert doc[0].orth_ == 'Give'
|
assert doc[0].orth_ == 'Give'
|
||||||
assert doc[0].text == 'Give'
|
assert doc[0].text == 'Give'
|
||||||
assert doc[0].text_with_ws == 'Give '
|
assert doc[0].text_with_ws == 'Give '
|
||||||
|
|
|
@ -63,9 +63,7 @@ def test_parser_merge_pp(en_tokenizer):
|
||||||
tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ']
|
tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ']
|
||||||
|
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], deps=deps, heads=heads)
|
doc = get_doc(tokens.vocab, [t.text for t in tokens], deps=deps, heads=heads, tags=tags)
|
||||||
for token in doc:
|
|
||||||
token.tag_ = tags[token.i]
|
|
||||||
nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks]
|
nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks]
|
||||||
|
|
||||||
for start, end, lemma in nps:
|
for start, end, lemma in nps:
|
||||||
|
|
|
@ -5,19 +5,22 @@ from ..tokens import Doc
|
||||||
from ..attrs import ORTH, POS, HEAD, DEP
|
from ..attrs import ORTH, POS, HEAD, DEP
|
||||||
|
|
||||||
|
|
||||||
def get_doc(vocab, words=[], tags=None, heads=None, deps=None):
|
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None):
|
||||||
"""Create Doc object from given vocab, words and annotations."""
|
"""Create Doc object from given vocab, words and annotations."""
|
||||||
tags = tags or [''] * len(words)
|
pos = pos or [''] * len(words)
|
||||||
heads = heads or [0] * len(words)
|
heads = heads or [0] * len(words)
|
||||||
deps = deps or [''] * len(words)
|
deps = deps or [''] * len(words)
|
||||||
|
|
||||||
doc = Doc(vocab, words=words)
|
doc = Doc(vocab, words=words)
|
||||||
attrs = doc.to_array([POS, HEAD, DEP])
|
attrs = doc.to_array([POS, HEAD, DEP])
|
||||||
for i, (tag, head, dep) in enumerate(zip(tags, heads, deps)):
|
for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
|
||||||
attrs[i, 0] = doc.vocab.strings[tag]
|
attrs[i, 0] = doc.vocab.strings[p]
|
||||||
attrs[i, 1] = head
|
attrs[i, 1] = head
|
||||||
attrs[i, 2] = doc.vocab.strings[dep]
|
attrs[i, 2] = doc.vocab.strings[dep]
|
||||||
doc.from_array([POS, HEAD, DEP], attrs)
|
doc.from_array([POS, HEAD, DEP], attrs)
|
||||||
|
if tags:
|
||||||
|
for token in doc:
|
||||||
|
token.tag_ = tags[token.i]
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user