* Refactor around Word objects, adapting tests. Tests passing, except for string views.

Matthew Honnibal 2014-08-23 19:55:06 +02:00
parent 4f01df9152
commit 9815c7649e
11 changed files with 65 additions and 117 deletions
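In short, the diff moves the public API from opaque LexID handles plus free helper functions (lex_of, length_of, shape_of, ...) to Word objects with attributes: tokenize() now returns a plain list of Word, lookup() returns a Word, and unhash() still maps a StringHash back to its string. A minimal before/after sketch, using only calls that appear in the test changes below:

    # Old API (removed in this commit)
    tokens = tokenize("ain't")
    assert unhash(lex_of(tokens[1])) == "not"

    # New API (this commit): attribute access on Word objects
    tokens = tokenize("ain't")
    assert unhash(tokens[1].lex) == "not"
    assert lookup("n't").length == 3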

View File

@@ -1,7 +1,6 @@
 from spacy.spacy cimport Language
-from spacy.lexeme cimport LexID
-from spacy.tokens cimport Tokens
 from spacy.lexeme cimport StringHash
+from spacy.word cimport Word
 cdef class PennTreebank3(Language):
@@ -10,6 +9,6 @@ cdef class PennTreebank3(Language):
 cdef PennTreebank3 PTB3
-cpdef LexID lookup(unicode word) except 0
-cpdef Tokens tokenize(unicode string)
+cpdef Word lookup(unicode word)
+cpdef list tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)

View File

@@ -77,18 +77,21 @@ def nltk_regex_tokenize(text):
 cdef class PennTreebank3(Language):
     cpdef list find_substrings(self, unicode chunk):
         strings = nltk_regex_tokenize(chunk)
+        if strings[-1] == '.':
+            strings.pop()
+            strings[-1] += '.'
         assert strings
         return strings
 cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
-cpdef Tokens tokenize(unicode string):
+cpdef list tokenize(unicode string):
     return PTB3.tokenize(string)
-cpdef LexID lookup(unicode string) except 0:
-    return <LexID>PTB3.lookup(string)
+cpdef Word lookup(unicode string):
+    return PTB3.lookup(string)
 cpdef unicode unhash(StringHash hash_value):
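The three lines added to find_substrings handle a chunk whose final regex piece is a lone period: the period is popped and glued back onto the preceding substring. A small illustration of that list manipulation on a hypothetical regex output:

    # Hypothetical output of nltk_regex_tokenize for a sentence-final chunk.
    strings = ['1984', '.']
    if strings[-1] == '.':
        strings.pop()         # drop the lone period...
        strings[-1] += '.'    # ...and reattach it to the previous substring
    assert strings == ['1984.']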

View File

@@ -2,35 +2,33 @@ from __future__ import unicode_literals
 from spacy.en import tokenize, lookup, unhash
-from spacy import lex_of
 def test_possess():
     tokens = tokenize("Mike's")
-    assert unhash(lex_of(tokens[0])) == "Mike"
-    assert unhash(lex_of(tokens[1])) == "'s"
+    assert unhash(tokens[0].lex) == "Mike"
+    assert unhash(tokens[1].lex) == "'s"
     assert len(tokens) == 2
 def test_apostrophe():
     tokens = tokenize("schools'")
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[1])) == "'"
-    assert unhash(lex_of(tokens[0])) == "schools"
+    assert unhash(tokens[1].lex) == "'"
+    assert unhash(tokens[0].lex) == "schools"
 def test_LL():
     tokens = tokenize("we'll")
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[1])) == "will"
-    assert unhash(lex_of(tokens[0])) == "we"
+    assert unhash(tokens[1].lex) == "will"
+    assert unhash(tokens[0].lex) == "we"
 def test_aint():
     tokens = tokenize("ain't")
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[0])) == "are"
-    assert unhash(lex_of(tokens[1])) == "not"
+    assert unhash(tokens[0].lex) == "are"
+    assert unhash(tokens[1].lex) == "not"
 def test_capitalized():
@@ -40,4 +38,4 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = tokenize("Ain't")
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[0])) == "Are"
+    assert unhash(tokens[0].lex) == "Are"

View File

@@ -3,8 +3,8 @@ from __future__ import unicode_literals
 import pytest
 from spacy.en import lookup, unhash
-from spacy.en import lex_of, shape_of, norm_of, first_of, length_of
+import spacy.word
 @pytest.fixture
 def C3P0():
@@ -12,17 +12,16 @@ def C3P0():
 def test_shape(C3P0):
-    assert unhash(shape_of(C3P0)) == "XdXd"
+    # TODO: Fix this
+    assert unhash(C3P0.get_view(2)) == "XdXd"
 def test_length():
     t = lookup('the')
-    assert length_of(t) == 3
+    assert t.length == 3
-    #t = lookup('')
-    #assert length_of(t) == 0
     t = lookup("n't")
-    assert length_of(t) == 3
+    assert t.length == 3
     t = lookup("'s")
-    assert length_of(t) == 2
+    assert t.length == 2
     t = lookup('Xxxx')
-    assert length_of(t) == 4
+    assert t.length == 4
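This hunk is where the "string views" flagged in the commit message come in: the old per-attribute helpers (shape_of, length_of, ...) give way to Word attributes and an indexed get_view() accessor, with index 2 apparently selecting the shape view; the TODO marks it as the part still failing. A usage sketch, assuming the C3P0 fixture simply returns lookup('C3P0') (the fixture body is not shown in this diff):

    w = lookup('C3P0')                       # assumed fixture value
    assert unhash(w.get_view(2)) == 'XdXd'   # view 2: word shape (letters -> X, digits -> d)
    assert w.length == 4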

View File

@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
-from spacy import lex_of
 from spacy.en import lookup
 from spacy.en import tokenize
 from spacy.en import unhash
@@ -19,8 +18,8 @@ def test_close(close_puncts):
     string = word_str + p
     tokens = tokenize(string)
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[1])) == p
-    assert unhash(lex_of(tokens[0])) == word_str
+    assert unhash(tokens[1].lex) == p
+    assert unhash(tokens[0].lex) == word_str
 def test_two_different_close(close_puncts):
@@ -29,9 +28,9 @@ def test_two_different_close(close_puncts):
     string = word_str + p + "'"
     tokens = tokenize(string)
     assert len(tokens) == 3
-    assert unhash(lex_of(tokens[0])) == word_str
-    assert unhash(lex_of(tokens[1])) == p
-    assert unhash(lex_of(tokens[2])) == "'"
+    assert unhash(tokens[0].lex) == word_str
+    assert unhash(tokens[1].lex) == p
+    assert unhash(tokens[2].lex) == "'"
 def test_three_same_close(close_puncts):
@@ -40,5 +39,5 @@ def test_three_same_close(close_puncts):
     string = word_str + p + p + p
     tokens = tokenize(string)
     assert len(tokens) == 4
-    assert unhash(lex_of(tokens[0])) == word_str
-    assert unhash(lex_of(tokens[1])) == p
+    assert unhash(tokens[0].lex) == word_str
+    assert unhash(tokens[1].lex) == p

View File

@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
-from spacy import lex_of
 from spacy.en import lookup
 from spacy.en import tokenize
 from spacy.en import unhash
@@ -19,8 +18,8 @@ def test_open(open_puncts):
     string = p + word_str
     tokens = tokenize(string)
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[0])) == p
-    assert unhash(lex_of(tokens[1])) == word_str
+    assert unhash(tokens[0].lex) == p
+    assert unhash(tokens[1].lex) == word_str
 def test_two_different_open(open_puncts):
@@ -29,9 +28,9 @@ def test_two_different_open(open_puncts):
     string = p + "`" + word_str
     tokens = tokenize(string)
     assert len(tokens) == 3
-    assert unhash(lex_of(tokens[0])) == p
-    assert unhash(lex_of(tokens[1])) == "`"
-    assert unhash(lex_of(tokens[2])) == word_str
+    assert unhash(tokens[0].lex) == p
+    assert unhash(tokens[1].lex) == "`"
+    assert unhash(tokens[2].lex) == word_str
 def test_three_same_open(open_puncts):
@@ -40,12 +39,12 @@ def test_three_same_open(open_puncts):
     string = p + p + p + word_str
     tokens = tokenize(string)
     assert len(tokens) == 4
-    assert unhash(lex_of(tokens[0])) == p
-    assert unhash(lex_of(tokens[3])) == word_str
+    assert unhash(tokens[0].lex) == p
+    assert unhash(tokens[3].lex) == word_str
 def test_open_appostrophe():
     string = "'The"
     tokens = tokenize(string)
     assert len(tokens) == 2
-    assert unhash(lex_of(tokens[0])) == "'"
+    assert unhash(tokens[0].lex) == "'"

View File

@@ -1,46 +0,0 @@
-from __future__ import unicode_literals
-from spacy.en import unhash
-from spacy import lex_of
-from spacy.util import utf8open
-from spacy.ptb3 import tokenize, lookup, unhash
-import pytest
-import os
-from os import path
-HERE = path.dirname(__file__)
-@pytest.fixture
-def sun_txt():
-    loc = path.join(HERE, 'sun.txt')
-    return utf8open(loc).read()
-@pytest.fixture
-def my_tokens(sun_txt):
-    assert len(sun_txt) != 0
-    tokens = tokenize(sun_txt)
-    return [unhash(lex_of(t)) for t in tokens]
-@pytest.fixture
-def sed_tokens():
-    loc = path.join(HERE, 'sun.tokens')
-    return utf8open(loc).read().split()
-def test_compare_tokens(my_tokens, sed_tokens):
-    me = my_tokens
-    sed = sed_tokens
-    i = 0
-    while i < len(me) and i < len(sed):
-        assert me[i] == sed[i]
-        i += 1
-    assert len(me) == len(sed)

View File

@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
-from spacy import lex_of
 from spacy.en import tokenize
 from spacy.en import lookup
 from spacy.en import unhash
@@ -19,9 +18,9 @@ def test_token(paired_puncts):
     string = open_ + word_str + close_
     tokens = tokenize(string)
     assert len(tokens) == 3
-    assert unhash(lex_of(tokens[0])) == open_
-    assert unhash(lex_of(tokens[1])) == word_str
-    assert unhash(lex_of(tokens[2])) == close_
+    assert unhash(tokens[0].lex) == open_
+    assert unhash(tokens[1].lex) == word_str
+    assert unhash(tokens[2].lex) == close_
 def test_two_different(paired_puncts):
@@ -30,9 +29,9 @@ def test_two_different(paired_puncts):
     string = "`" + open_ + word_str + close_ + "'"
     tokens = tokenize(string)
     assert len(tokens) == 5
-    assert unhash(lex_of(tokens[0])) == "`"
-    assert unhash(lex_of(tokens[1])) == open_
-    assert unhash(lex_of(tokens[2])) == word_str
-    assert unhash(lex_of(tokens[2])) == word_str
-    assert unhash(lex_of(tokens[3])) == close_
-    assert unhash(lex_of(tokens[4])) == "'"
+    assert unhash(tokens[0].lex) == "`"
+    assert unhash(tokens[1].lex) == open_
+    assert unhash(tokens[2].lex) == word_str
+    assert unhash(tokens[2].lex) == word_str
+    assert unhash(tokens[3].lex) == close_
+    assert unhash(tokens[4].lex) == "'"

View File

@@ -3,8 +3,6 @@ from __future__ import unicode_literals
 from spacy.en import tokenize
 from spacy.en import lookup
-from spacy.lexeme import lex_of
 def test_single_word():
     lex_ids = tokenize(u'hello')
@@ -12,33 +10,33 @@ def test_single_word():
 def test_two_words():
-    lex_ids = tokenize(u'hello possums')
-    assert len(lex_ids) == 2
-    assert lex_ids[0] == lookup(u'hello')
-    assert lex_ids[0] != lex_ids[1]
+    words = tokenize('hello possums')
+    assert len(words) == 2
+    assert words[0] == lookup('hello')
+    assert words[0] != words[1]
 def test_punct():
     tokens = tokenize('hello, possums.')
     assert len(tokens) == 4
-    assert lex_of(tokens[0]) == lex_of(lookup('hello'))
-    assert lex_of(tokens[1]) == lex_of(lookup(','))
-    assert lex_of(tokens[2]) == lex_of(lookup('possums'))
-    assert lex_of(tokens[1]) != lex_of(lookup('hello'))
+    assert tokens[0].lex == lookup('hello').lex
+    assert tokens[1].lex == lookup(',').lex
+    assert tokens[2].lex == lookup('possums').lex
+    assert tokens[1].lex != lookup('hello').lex
 def test_digits():
     lex_ids = tokenize('The year: 1984.')
     assert len(lex_ids) == 5
-    assert lex_of(lex_ids[0]) == lex_of(lookup('The'))
-    assert lex_of(lex_ids[3]) == lex_of(lookup('1984'))
-    assert lex_of(lex_ids[4]) == lex_of(lookup('.'))
+    assert lex_ids[0].lex == lookup('The').lex
+    assert lex_ids[3].lex == lookup('1984').lex
+    assert lex_ids[4].lex == lookup('.').lex
 def test_contraction():
     lex_ids = tokenize("don't giggle")
     assert len(lex_ids) == 3
-    assert lex_of(lex_ids[1]) == lex_of(lookup("not"))
+    assert lex_ids[1].lex == lookup("not").lex
     lex_ids = tokenize("i said don't!")
     assert len(lex_ids) == 4
-    assert lex_of(lex_ids[3]) == lex_of(lookup('!'))
+    assert lex_ids[3].lex == lookup('!').lex

View File

@@ -17,7 +17,7 @@ def test_eq():
 def test_round_trip():
     hello = lookup('Hello')
-    assert unhash(lex_of(hello)) == 'Hello'
+    assert unhash(hello.lex) == 'Hello'
 def test_case_neq():
@@ -32,6 +32,6 @@ def test_punct_neq():
 def test_short():
     addr = lookup('I')
-    assert unhash(lex_of(addr)) == 'I'
+    assert unhash(addr.lex) == 'I'
     addr = lookup('not')
-    assert unhash(lex_of(addr)) == 'not'
+    assert unhash(addr.lex) == 'not'
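The final hunk keeps the basic round-trip invariant intact under the new attribute access: looking a string up and unhashing the resulting Word's lex recovers the original string. Condensed from the asserts above:

    # lookup/unhash round-trip, as exercised by test_round_trip and test_short
    for s in ('Hello', 'I', 'not'):
        assert unhash(lookup(s).lex) == s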