# spaCy/spacy/tests/regression/test_issue5048.py

import numpy
from spacy.tokens import Doc
from spacy.attrs import DEP, POS, TAG

from ..util import get_doc


def test_issue5048(en_vocab):
    """Regression test for issue #5048.

    The same four tokens are annotated twice: once by loading hand-built
    attribute values through ``Doc.from_array``, and once through the
    ``get_doc`` test helper. Both docs must expose identical
    ``(text, pos_, tag_)`` triples.
    """
    tokens = ["This", "is", "a", "sentence"]
    whitespace = [" ", " ", " ", ""]
    pos_labels = ["DET", "VERB", "DET", "NOUN"]
    dep_labels = ["dep", "adj", "nn", "atm"]
    tag_labels = ["DT", "VBZ", "DT", "NN"]

    # Register every string with the vocab's string store and keep the
    # values it returns for the labels; those values fill the array below.
    store = en_vocab.strings
    for token_text in tokens:
        store.add(token_text)
    dep_ids = [store.add(label) for label in dep_labels]
    pos_ids = [store.add(label) for label in pos_labels]
    tag_ids = [store.add(label) for label in tag_labels]

    # One row per token; the column order has to match `columns`.
    columns = [POS, DEP, TAG]
    rows = list(zip(pos_ids, dep_ids, tag_ids))
    values = numpy.array(rows, dtype="uint64")

    def summarize(doc):
        # Only the text, coarse POS and fine-grained tag are compared.
        return [(tok.text, tok.pos_, tok.tag_) for tok in doc]

    from_array_doc = Doc(en_vocab, words=tokens, spaces=whitespace)
    from_array_doc.from_array(columns, values)
    expected = summarize(from_array_doc)

    helper_doc = get_doc(
        en_vocab, words=tokens, pos=pos_labels, deps=dep_labels, tags=tag_labels
    )
    actual = summarize(helper_doc)

    assert expected == actual
Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method 2020-03-02 13:49:28 +03:00			`import numpy`
			`from spacy.tokens import Doc`
			`from spacy.attrs import DEP, POS, TAG`

			`from ..util import get_doc`


			`def test_issue5048(en_vocab):`
			`words = ["This", "is", "a", "sentence"]`
			`pos_s = ["DET", "VERB", "DET", "NOUN"]`
			`spaces = [" ", " ", " ", ""]`
			`deps_s = ["dep", "adj", "nn", "atm"]`
			`tags_s = ["DT", "VBZ", "DT", "NN"]`

			`strings = en_vocab.strings`

			`for w in words:`
			`strings.add(w)`
			`deps = [strings.add(d) for d in deps_s]`
			`pos = [strings.add(p) for p in pos_s]`
			`tags = [strings.add(t) for t in tags_s]`

			`attrs = [POS, DEP, TAG]`
			`array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")`

			`doc = Doc(en_vocab, words=words, spaces=spaces)`
			`doc.from_array(attrs, array)`
			`v1 = [(token.text, token.pos_, token.tag_) for token in doc]`

			`doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)`
			`v2 = [(token.text, token.pos_, token.tag_) for token in doc2]`
			`assert v1 == v2`