From 9815c7649ef7c4b2e6bc5ada80472060b4c68a7e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 23 Aug 2014 19:55:06 +0200
Subject: [PATCH] * Refactor around Word objects, adapting tests. Tests
 passing, except for string views.

---
 spacy/ptb3.pxd                                |  7 ++-
 spacy/ptb3.pyx                                |  9 ++--
 tests/{test_group_by.py => _depr_group_by.py} |  0
 tests/test_contractions.py                    | 20 ++++----
 tests/test_orth.py                            | 15 +++---
 tests/test_post_punct.py                      | 15 +++---
 tests/test_pre_punct.py                       | 17 ++++---
 tests/test_ptb_match_wiki_sun.py              | 46 -------------
 tests/test_surround_punct.py                  | 19 ++++----
 tests/test_tokenizer.py                       | 28 ++++++-----
 tests/test_vocab.py                           |  6 +--
 11 files changed, 65 insertions(+), 117 deletions(-)
 rename tests/{test_group_by.py => _depr_group_by.py} (100%)
 delete mode 100644 tests/test_ptb_match_wiki_sun.py

diff --git a/spacy/ptb3.pxd b/spacy/ptb3.pxd
index 58773978f..54b8ad12a 100644
--- a/spacy/ptb3.pxd
+++ b/spacy/ptb3.pxd
@@ -1,7 +1,6 @@
 from spacy.spacy cimport Language
-from spacy.lexeme cimport LexID
-from spacy.tokens cimport Tokens
 from spacy.lexeme cimport StringHash
+from spacy.word cimport Word
 
 
 cdef class PennTreebank3(Language):
@@ -10,6 +9,6 @@ cdef class PennTreebank3(Language):
 
 cdef PennTreebank3 PTB3
 
-cpdef LexID lookup(unicode word) except 0
-cpdef Tokens tokenize(unicode string)
+cpdef Word lookup(unicode word)
+cpdef list tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)
diff --git a/spacy/ptb3.pyx b/spacy/ptb3.pyx
index b394f9473..80efac36a 100644
--- a/spacy/ptb3.pyx
+++ b/spacy/ptb3.pyx
@@ -77,18 +77,21 @@ def nltk_regex_tokenize(text):
 cdef class PennTreebank3(Language):
     cpdef list find_substrings(self, unicode chunk):
         strings = nltk_regex_tokenize(chunk)
+        if strings[-1] == '.':
+            strings.pop()
+            strings[-1] += '.'
         assert strings
         return strings
 
 cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
 
 
-cpdef Tokens tokenize(unicode string):
+cpdef list tokenize(unicode string):
     return PTB3.tokenize(string)
 
 
-cpdef LexID lookup(unicode string) except 0:
-    return PTB3.lookup(string)
+cpdef Word lookup(unicode string):
+    return PTB3.lookup(string)
 
 
 cpdef unicode unhash(StringHash hash_value):
diff --git a/tests/test_group_by.py b/tests/_depr_group_by.py
similarity index 100%
rename from tests/test_group_by.py
rename to tests/_depr_group_by.py
diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index 1839b15f5..82f975b27 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -2,35 +2,33 @@ from __future__ import unicode_literals
 
 from spacy.en import tokenize, lookup, unhash
 
-from spacy import lex_of
-
 
 def test_possess():
     tokens = tokenize("Mike's")
-    assert unhash(lex_of(tokens[0])) == "Mike"
-    assert unhash(lex_of(tokens[1])) == "'s"
+    assert unhash(tokens[0].lex) == "Mike"
+    assert unhash(tokens[1].lex) == "'s"
     assert len(tokens) == 2
 
 
 def test_apostrophe():
     tokens = tokenize("schools'")
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[1])) == "'"
-    assert unhash(lex_of(tokens[0])) == "schools"
+    assert unhash(tokens[1].lex) == "'"
+    assert unhash(tokens[0].lex) == "schools"
 
 
 def test_LL():
     tokens = tokenize("we'll")
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[1])) == "will"
-    assert unhash(lex_of(tokens[0])) == "we"
+    assert unhash(tokens[1].lex) == "will"
+    assert unhash(tokens[0].lex) == "we"
 
 
 def test_aint():
     tokens = tokenize("ain't")
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[0])) == "are"
-    assert unhash(lex_of(tokens[1])) == "not"
+    assert unhash(tokens[0].lex) == "are"
+    assert unhash(tokens[1].lex) == "not"
 
 
 def test_capitalized():
@@ -40,4 +38,4 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = tokenize("Ain't")
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[0])) == "Are"
+    assert unhash(tokens[0].lex) == "Are"
diff --git a/tests/test_orth.py b/tests/test_orth.py
index 7f333c941..4dbc3cd5f 100644
--- a/tests/test_orth.py
+++ b/tests/test_orth.py
@@ -3,8 +3,8 @@ from __future__ import unicode_literals
 import pytest
 
 from spacy.en import lookup, unhash
+import spacy.word
 
-from spacy.en import lex_of, shape_of, norm_of, first_of, length_of
 
 @pytest.fixture
 def C3P0():
@@ -12,17 +12,16 @@ def C3P0():
 
 
 def test_shape(C3P0):
-    assert unhash(shape_of(C3P0)) == "XdXd"
+    # TODO: Fix this
+    assert unhash(C3P0.get_view(2)) == "XdXd"
 
 
 def test_length():
     t = lookup('the')
-    assert length_of(t) == 3
-    #t = lookup('')
-    #assert length_of(t) == 0
+    assert t.length == 3
     t = lookup("n't")
-    assert length_of(t) == 3
+    assert t.length == 3
     t = lookup("'s")
-    assert length_of(t) == 2
+    assert t.length == 2
     t = lookup('Xxxx')
-    assert length_of(t) == 4
+    assert t.length == 4
diff --git a/tests/test_post_punct.py b/tests/test_post_punct.py
index f8391235a..e5d2d0705 100644
--- a/tests/test_post_punct.py
+++ b/tests/test_post_punct.py
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 
-from spacy import lex_of
 from spacy.en import lookup
 from spacy.en import tokenize
 from spacy.en import unhash
@@ -19,8 +18,8 @@ def test_close(close_puncts):
         string = word_str + p
         tokens = tokenize(string)
         assert len(tokens) == 2
-        assert unhash(lex_of(tokens[1])) == p
-        assert unhash(lex_of(tokens[0])) == word_str
+        assert unhash(tokens[1].lex) == p
+        assert unhash(tokens[0].lex) == word_str
 
 
 def test_two_different_close(close_puncts):
@@ -29,9 +28,9 @@ def test_two_different_close(close_puncts):
         string = word_str + p + "'"
         tokens = tokenize(string)
         assert len(tokens) == 3
-        assert unhash(lex_of(tokens[0])) == word_str
-        assert unhash(lex_of(tokens[1])) == p
-        assert unhash(lex_of(tokens[2])) == "'"
+        assert unhash(tokens[0].lex) == word_str
+        assert unhash(tokens[1].lex) == p
+        assert unhash(tokens[2].lex) == "'"
 
 
 def test_three_same_close(close_puncts):
@@ -40,5 +39,5 @@ def test_three_same_close(close_puncts):
         string = word_str + p + p + p
         tokens = tokenize(string)
         assert len(tokens) == 4
-        assert unhash(lex_of(tokens[0])) == word_str
-        assert unhash(lex_of(tokens[1])) == p
+        assert unhash(tokens[0].lex) == word_str
+        assert unhash(tokens[1].lex) == p
diff --git a/tests/test_pre_punct.py b/tests/test_pre_punct.py
index 5a4a4d072..83e743c44 100644
--- a/tests/test_pre_punct.py
+++ b/tests/test_pre_punct.py
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 
-from spacy import lex_of
 from spacy.en import lookup
 from spacy.en import tokenize
 from spacy.en import unhash
@@ -19,8 +18,8 @@ def test_open(open_puncts):
         string = p + word_str
         tokens = tokenize(string)
         assert len(tokens) == 2
-        assert unhash(lex_of(tokens[0])) == p
-        assert unhash(lex_of(tokens[1])) == word_str
+        assert unhash(tokens[0].lex) == p
+        assert unhash(tokens[1].lex) == word_str
 
 
 def test_two_different_open(open_puncts):
@@ -29,9 +28,9 @@ def test_two_different_open(open_puncts):
         string = p + "`" + word_str
        tokens = tokenize(string)
         assert len(tokens) == 3
-        assert unhash(lex_of(tokens[0])) == p
-        assert unhash(lex_of(tokens[1])) == "`"
-        assert unhash(lex_of(tokens[2])) == word_str
+        assert unhash(tokens[0].lex) == p
+        assert unhash(tokens[1].lex) == "`"
+        assert unhash(tokens[2].lex) == word_str
 
 
 def test_three_same_open(open_puncts):
@@ -40,12 +39,12 @@ def test_three_same_open(open_puncts):
         string = p + p + p + word_str
         tokens = tokenize(string)
         assert len(tokens) == 4
-        assert unhash(lex_of(tokens[0])) == p
-        assert unhash(lex_of(tokens[3])) == word_str
+        assert unhash(tokens[0].lex) == p
+        assert unhash(tokens[3].lex) == word_str
 
 
 def test_open_appostrophe():
     string = "'The"
     tokens = tokenize(string)
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[0])) == "'"
+    assert unhash(tokens[0].lex) == "'"
diff --git a/tests/test_ptb_match_wiki_sun.py b/tests/test_ptb_match_wiki_sun.py
deleted file mode 100644
index 5d6306cfc..000000000
--- a/tests/test_ptb_match_wiki_sun.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from __future__ import unicode_literals
-
-from spacy.en import unhash
-from spacy import lex_of
-from spacy.util import utf8open
-from spacy.ptb3 import tokenize, lookup, unhash
-
-import pytest
-import os
-from os import path
-
-
-HERE = path.dirname(__file__)
-
-
-@pytest.fixture
-def sun_txt():
-    loc = path.join(HERE, 'sun.txt')
-    return utf8open(loc).read()
-
-
-@pytest.fixture
-def my_tokens(sun_txt):
-    assert len(sun_txt) != 0
-    tokens = tokenize(sun_txt)
-    return [unhash(lex_of(t)) for t in tokens]
-
-
-@pytest.fixture
-def sed_tokens():
-    loc = path.join(HERE, 'sun.tokens')
-    return utf8open(loc).read().split()
-
-
-def test_compare_tokens(my_tokens, sed_tokens):
-    me = my_tokens
-    sed = sed_tokens
-    i = 0
-    while i < len(me) and i < len(sed):
-        assert me[i] == sed[i]
-        i += 1
-
-    assert len(me) == len(sed)
-
-
-
diff --git a/tests/test_surround_punct.py b/tests/test_surround_punct.py
index 2c3a7f837..686d8cfc2 100644
--- a/tests/test_surround_punct.py
+++ b/tests/test_surround_punct.py
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 
-from spacy import lex_of
 from spacy.en import tokenize
 from spacy.en import lookup
 from spacy.en import unhash
@@ -19,9 +18,9 @@ def test_token(paired_puncts):
         string = open_ + word_str + close_
         tokens = tokenize(string)
         assert len(tokens) == 3
-        assert unhash(lex_of(tokens[0])) == open_
-        assert unhash(lex_of(tokens[1])) == word_str
-        assert unhash(lex_of(tokens[2])) == close_
+        assert unhash(tokens[0].lex) == open_
+        assert unhash(tokens[1].lex) == word_str
+        assert unhash(tokens[2].lex) == close_
 
 
 def test_two_different(paired_puncts):
@@ -30,9 +29,9 @@ def test_two_different(paired_puncts):
         string = "`" + open_ + word_str + close_ + "'"
         tokens = tokenize(string)
         assert len(tokens) == 5
-        assert unhash(lex_of(tokens[0])) == "`"
-        assert unhash(lex_of(tokens[1])) == open_
-        assert unhash(lex_of(tokens[2])) == word_str
-        assert unhash(lex_of(tokens[2])) == word_str
-        assert unhash(lex_of(tokens[3])) == close_
-        assert unhash(lex_of(tokens[4])) == "'"
+        assert unhash(tokens[0].lex) == "`"
+        assert unhash(tokens[1].lex) == open_
+        assert unhash(tokens[2].lex) == word_str
+        assert unhash(tokens[2].lex) == word_str
+        assert unhash(tokens[3].lex) == close_
+        assert unhash(tokens[4].lex) == "'"
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index a0dbdc129..c99d387ce 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -3,8 +3,6 @@ from __future__ import unicode_literals
 from spacy.en import tokenize
 from spacy.en import lookup
 
-from spacy.lexeme import lex_of
-
 
 def test_single_word():
     lex_ids = tokenize(u'hello')
@@ -12,33 +10,33 @@ def test_single_word():
 
 
 def test_two_words():
-    lex_ids = tokenize(u'hello possums')
-    assert len(lex_ids) == 2
-    assert lex_ids[0] == lookup(u'hello')
-    assert lex_ids[0] != lex_ids[1]
+    words = tokenize('hello possums')
+    assert len(words) == 2
+    assert words[0] == lookup('hello')
+    assert words[0] != words[1]
 
 
 def test_punct():
     tokens = tokenize('hello, possums.')
     assert len(tokens) == 4
-    assert lex_of(tokens[0]) == lex_of(lookup('hello'))
-    assert lex_of(tokens[1]) == lex_of(lookup(','))
-    assert lex_of(tokens[2]) == lex_of(lookup('possums'))
-    assert lex_of(tokens[1]) != lex_of(lookup('hello'))
+    assert tokens[0].lex == lookup('hello').lex
+    assert tokens[1].lex == lookup(',').lex
+    assert tokens[2].lex == lookup('possums').lex
+    assert tokens[1].lex != lookup('hello').lex
 
 
 def test_digits():
     lex_ids = tokenize('The year: 1984.')
     assert len(lex_ids) == 5
-    assert lex_of(lex_ids[0]) == lex_of(lookup('The'))
-    assert lex_of(lex_ids[3]) == lex_of(lookup('1984'))
-    assert lex_of(lex_ids[4]) == lex_of(lookup('.'))
+    assert lex_ids[0].lex == lookup('The').lex
+    assert lex_ids[3].lex == lookup('1984').lex
+    assert lex_ids[4].lex == lookup('.').lex
 
 
 def test_contraction():
     lex_ids = tokenize("don't giggle")
     assert len(lex_ids) == 3
-    assert lex_of(lex_ids[1]) == lex_of(lookup("not"))
+    assert lex_ids[1].lex == lookup("not").lex
     lex_ids = tokenize("i said don't!")
     assert len(lex_ids) == 4
-    assert lex_of(lex_ids[3]) == lex_of(lookup('!'))
+    assert lex_ids[3].lex == lookup('!').lex
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index 01290a10b..6128e728a 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -17,7 +17,7 @@ def test_eq():
 
 def test_round_trip():
     hello = lookup('Hello')
-    assert unhash(lex_of(hello)) == 'Hello'
+    assert unhash(hello.lex) == 'Hello'
 
 
 def test_case_neq():
@@ -32,6 +32,6 @@ def test_punct_neq():
 
 def test_short():
     addr = lookup('I')
-    assert unhash(lex_of(addr)) == 'I'
+    assert unhash(addr.lex) == 'I'
     addr = lookup('not')
-    assert unhash(lex_of(addr)) == 'not'
+    assert unhash(addr.lex) == 'not'