* Add test to check how well we match ptb tokenizer. Needs more text.

2025-11-09 20:38:06 +03:00 · 2014-07-07 05:11:31 +02:00 · 2014-07-07 05:11:31 +02:00 · e60b958b7d
commit e60b958b7d
parent 2c431f9fdc
1 changed files with 46 additions and 0 deletions
--- a/tests/test_ptb_match_wiki_sun.py
+++ b/tests/test_ptb_match_wiki_sun.py
@ -0,0 +1,46 @@
 from __future__ import unicode_literals
 from spacy.en import unhash
 from spacy import lex_of
 from spacy.util import utf8open
 from spacy.en_ptb import tokenize, lookup, unhash
 import pytest
 import os
 from os import path
 HERE = path.dirname(__file__)
@pytest.fixture
def sun_txt():
    """Return the full contents of ``sun.txt`` from this test's directory.

    The file is located relative to ``HERE`` (the directory of this module)
    and opened through ``utf8open``.
    """
    loc = path.join(HERE, 'sun.txt')
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    # NOTE(review): assumes utf8open returns a file-like object exposing
    # close() -- confirm against spacy.util.
    file_ = utf8open(loc)
    try:
        return file_.read()
    finally:
        file_.close()
@pytest.fixture
def my_tokens(sun_txt):
    """Tokenize the ``sun_txt`` fixture and return one string per token.

    Each token produced by ``tokenize`` is passed through ``lex_of`` and
    then ``unhash``; the results are collected in order.
    """
    # Guard against an empty input file, which would make the comparison
    # test pass vacuously.
    assert len(sun_txt) > 0
    strings = []
    for token in tokenize(sun_txt):
        strings.append(unhash(lex_of(token)))
    return strings
@pytest.fixture
def sed_tokens():
    """Return the reference tokens stored in ``sun.tokens``.

    The file is located relative to ``HERE`` (the directory of this module),
    read in full, and split on whitespace.
    """
    loc = path.join(HERE, 'sun.tokens')
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    # NOTE(review): assumes utf8open returns a file-like object exposing
    # close() -- confirm against spacy.util.
    file_ = utf8open(loc)
    try:
        return file_.read().split()
    finally:
        file_.close()
 def test_compare_tokens(my_tokens, sed_tokens):
    me = my_tokens
    sed = sed_tokens
    i = 0
    while i < len(me) and i < len(sed):
        assert me[i] == sed[i]
        i += 1
    assert len(me) == len(sed)