spaCy/tests/tokens/test_token_references.py

53 lines
1.6 KiB
Python
Raw Normal View History

from __future__ import unicode_literals
import pytest
import gc
2015-07-13 23:30:01 +03:00
from spacy.en import English, LOCAL_DATA_DIR
import os
2015-07-13 23:30:01 +03:00
# Data directory handed to English() below: the SPACY_DATA environment
# variable when it is set, otherwise LOCAL_DATA_DIR.
data_dir = os.getenv('SPACY_DATA', LOCAL_DATA_DIR)
# Build separate English instances for these tests rather than sharing one:
# careful handling of memory is the whole point of what is being tested.
def get_orphan_token(text, i):
    """Return token ``i`` of ``text``, without keeping the parse it came from.

    A new English instance is created on every call (``load_vectors=False``),
    ``text`` is parsed, and a garbage collection is forced before the token is
    picked out. The local reference to the parsed tokens is deleted before
    returning, so the caller receives only the single token.
    """
    pipeline = English(load_vectors=False, data_dir=data_dir)
    parsed = pipeline(text)
    gc.collect()
    orphan = parsed[i]
    # Drop the local reference to the parse; only the token is handed back.
    del parsed
    return orphan
def test_orphan():
    """A token returned by get_orphan_token() must still be readable later."""
    survivor = get_orphan_token('An orphan token', 1)
    gc.collect()
    # Two further load/parse cycles run before the first token is inspected;
    # the results themselves are not examined.
    filler = get_orphan_token('Load and flush the memory', 0)
    filler = get_orphan_token('Load again...', 0)
    accepted_tags = ('ADJ', 'NOUN')
    assert survivor.orth_ == 'orphan'
    assert survivor.pos_ in accepted_tags
    assert survivor.head.orth_ == 'token'
2015-02-16 19:49:31 +03:00
def _orphan_from_list(toks):
    """Return a new list containing every item produced by iterating ``toks``.

    The tests pass the tokens from ``nlp()`` through this helper so that the
    returned list, rather than the object ``nlp()`` produced, is what gets
    kept afterwards.
    """
    # list() performs the same iterate-and-append copy as an explicit loop.
    return list(toks)
2015-04-19 22:39:18 +03:00
2015-02-16 19:49:31 +03:00
def test_list_orphans():
    """Tokens collected into a plain list must remain readable.

    Test case from NSchrading: each sample is parsed with ``nlp()``, the
    resulting tokens are copied into a list by ``_orphan_from_list()``, and
    those lists are merged into one. The value returned by ``nlp(sample)`` is
    never bound to a name here; only the merged list of tokens is kept.
    """
    nlp = English(load_vectors=False, data_dir=data_dir)
    samples = ["a", "test blah wat okay"]
    lst = []
    for sample in samples:
        # Parse the sample and keep only the list of its tokens.
        lst.extend(_orphan_from_list(nlp(sample)))
    orths = ['a', 'test', 'blah', 'wat', 'okay']
    # Compare the whole sequence at once: checking item by item against
    # orths[i] would let a list with missing tokens pass unnoticed.
    assert [tok.orth_ for tok in lst] == orths