mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Add test to check how well we match ptb tokenizer. Needs more text.
This commit is contained in:
parent
2c431f9fdc
commit
e60b958b7d
46
tests/test_ptb_match_wiki_sun.py
Normal file
46
tests/test_ptb_match_wiki_sun.py
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from spacy.en import unhash
|
||||||
|
from spacy import lex_of
|
||||||
|
from spacy.util import utf8open
|
||||||
|
from spacy.en_ptb import tokenize, lookup, unhash
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import os
|
||||||
|
from os import path
|
||||||
|
|
||||||
|
|
||||||
|
# Directory containing this test module; the fixtures below join it with
# 'sun.txt' and 'sun.tokens' to locate their data files.
HERE = path.dirname(__file__)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sun_txt():
    """Return the entire contents of ``sun.txt``.

    The file is looked up in the directory held by ``HERE`` and opened
    with ``utf8open``.
    """
    from contextlib import closing

    loc = path.join(HERE, 'sun.txt')
    # closing() releases the handle even if read() raises; previously the
    # file object was never closed explicitly.
    with closing(utf8open(loc)) as handle:
        return handle.read()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def my_tokens(sun_txt):
    """Tokenize ``sun_txt`` and return one entry per token.

    Each token produced by ``tokenize`` is passed through ``lex_of`` and
    then ``unhash``; presumably this recovers the token's string form so it
    can be compared with the reference tokens (confirm against spacy).
    """
    # An empty input would make the comparison test pass vacuously.
    assert len(sun_txt) > 0
    surface_forms = []
    for token in tokenize(sun_txt):
        surface_forms.append(unhash(lex_of(token)))
    return surface_forms
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sed_tokens():
    """Return the reference tokens stored in ``sun.tokens``.

    The file is looked up in the directory held by ``HERE``, opened with
    ``utf8open``, and split on whitespace, so every whitespace-separated
    chunk is one expected token. (The name suggests the file was produced
    by a sed-based tokenizer script; confirm against the data file.)
    """
    from contextlib import closing

    loc = path.join(HERE, 'sun.tokens')
    # closing() releases the handle even if read() raises; previously the
    # file object was never closed explicitly.
    with closing(utf8open(loc)) as handle:
        return handle.read().split()
|
||||||
|
|
||||||
|
|
||||||
|
def test_compare_tokens(my_tokens, sed_tokens):
    """Check that our tokens equal the reference tokens, position by position.

    Args:
        my_tokens: Sequence of tokens produced by the tokenizer under test.
        sed_tokens: Sequence of expected tokens.

    Raises:
        AssertionError: At the first position where the two sequences
            differ, or if one sequence is a strict prefix of the other.
    """
    # Walk the shared prefix first so a failure points at the first diverging
    # token instead of only reporting a length difference.
    for index, (mine, expected) in enumerate(zip(my_tokens, sed_tokens)):
        assert mine == expected, (
            'token %d differs: %r != %r' % (index, mine, expected)
        )

    # zip() stops at the shorter sequence, so extra or missing trailing
    # tokens are caught here.
    assert len(my_tokens) == len(sed_tokens)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user