spaCy/spacy/tests/tokens/test_noun_chunks.py
2016-04-08 16:46:44 +02:00

36 lines
1.1 KiB
Python

import numpy as np
from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root
from spacy.en import English
from spacy.syntax.iterators import EnglishNounChunks
def test_not_nested():
    """Check that noun chunks of a hand-annotated sentence do not overlap.

    An ``English`` pipeline is created with ``parser=False`` and
    ``entity=False``, so the HEAD and DEP values are written in by hand
    through ``from_array``.  ``EnglishNounChunks`` is then assigned as the
    document's noun-chunk iterator, and the test fails if any word text is
    yielded more than once across all of the chunks.
    """
    pipeline = English(parser=False, entity=False)
    text = u'''Peter has chronic command and control issues'''.strip()
    doc = pipeline(text)

    # One [HEAD, DEP] row per word of the sentence, in the same column order
    # as the attribute list handed to from_array below.
    # NOTE(review): the negative HEAD values suggest offsets relative to the
    # token itself -- confirm against the from_array documentation.
    annotation = np.asarray(
        [
            [1, nsubj],
            [0, root],
            [4, amod],
            [3, nmod],
            [-1, cc],
            [-2, conj],
            [-5, dobj]
        ],
        dtype='int32')
    doc.from_array([HEAD, DEP], annotation)
    doc.noun_chunks = EnglishNounChunks

    # Echo every chunk so a failing run shows what was produced.
    for span in doc.noun_chunks:
        print(span.text)

    # Tally how many times each word text turns up inside any chunk.
    seen_counts = {}
    for span in doc.noun_chunks:
        for token in span:
            seen_counts[token.text] = seen_counts.get(token.text, 0) + 1

    # A count above one means two chunks share a word, i.e. they are nested
    # or overlapping.
    for word, freq in seen_counts.items():
        assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])