* Refactor around Word objects, adapting tests. Tests passing, except for string views.

2025-11-04 01:48:04 +03:00 · 2014-08-23 19:55:06 +02:00 · 2014-08-23 19:55:06 +02:00 · 9815c7649e
commit 9815c7649e
parent 4f01df9152
11 changed files with 65 additions and 117 deletions
--- a/spacy/ptb3.pxd
+++ b/spacy/ptb3.pxd
@@ -1,7 +1,6 @@
 from spacy.spacy cimport Language
-from spacy.lexeme cimport LexID
-from spacy.tokens cimport Tokens
 from spacy.lexeme cimport StringHash
+from spacy.word cimport Word


 cdef class PennTreebank3(Language):
@@ -10,6 +9,6 @@ cdef class PennTreebank3(Language):

 cdef PennTreebank3 PTB3

-cpdef LexID lookup(unicode word) except 0
-cpdef Tokens tokenize(unicode string)
+cpdef Word lookup(unicode word)
+cpdef list tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)
--- a/spacy/ptb3.pyx
+++ b/spacy/ptb3.pyx
@@ -77,18 +77,21 @@ def nltk_regex_tokenize(text):
 cdef class PennTreebank3(Language):
    cpdef list find_substrings(self, unicode chunk):
        strings = nltk_regex_tokenize(chunk)
+        if strings[-1] == '.':
+            strings.pop()
+            strings[-1] += '.'
        assert strings
        return strings
    

 cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')

-cpdef Tokens tokenize(unicode string):
+cpdef list tokenize(unicode string):
    return PTB3.tokenize(string)


-cpdef LexID lookup(unicode string) except 0:
-    return <LexID>PTB3.lookup(string)
+cpdef Word lookup(unicode string):
+    return PTB3.lookup(string)


 cpdef unicode unhash(StringHash hash_value):
--- a/tests/_depr_group_by.py
+++ b/tests/_depr_group_by.py
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -2,35 +2,33 @@ from __future__ import unicode_literals

 from spacy.en import tokenize, lookup, unhash

-from spacy import lex_of
-

 def test_possess():
    tokens = tokenize("Mike's")
-    assert unhash(lex_of(tokens[0])) == "Mike"
-    assert unhash(lex_of(tokens[1])) == "'s"
+    assert unhash(tokens[0].lex) == "Mike"
+    assert unhash(tokens[1].lex) == "'s"
    assert len(tokens) == 2


 def test_apostrophe():
    tokens = tokenize("schools'")
    assert len(tokens) == 2
-    assert unhash(lex_of(tokens[1])) == "'"
-    assert unhash(lex_of(tokens[0])) == "schools"
+    assert unhash(tokens[1].lex) == "'"
+    assert unhash(tokens[0].lex) == "schools"


 def test_LL():
    tokens = tokenize("we'll")
    assert len(tokens) == 2
-    assert unhash(lex_of(tokens[1])) == "will"
-    assert unhash(lex_of(tokens[0])) == "we"
+    assert unhash(tokens[1].lex) == "will"
+    assert unhash(tokens[0].lex) == "we"


 def test_aint():
    tokens = tokenize("ain't")
    assert len(tokens) == 2
-    assert unhash(lex_of(tokens[0])) == "are"
-    assert unhash(lex_of(tokens[1])) == "not"
+    assert unhash(tokens[0].lex) == "are"
+    assert unhash(tokens[1].lex) == "not"


 def test_capitalized():
@@ -40,4 +38,4 @@ def test_capitalized():
    assert len(tokens) == 2
    tokens = tokenize("Ain't")
    assert len(tokens) == 2
-    assert unhash(lex_of(tokens[0])) == "Are"
+    assert unhash(tokens[0].lex) == "Are"
--- a/tests/test_orth.py
+++ b/tests/test_orth.py
@@ -3,8 +3,8 @@ from __future__ import unicode_literals
 import pytest

 from spacy.en import lookup, unhash
+import spacy.word

-from spacy.en import lex_of, shape_of, norm_of, first_of, length_of

 @pytest.fixture
 def C3P0():
@@ -12,17 +12,16 @@ def C3P0():


 def test_shape(C3P0):
-    assert unhash(shape_of(C3P0)) == "XdXd"
+    # TODO: Fix this
+    assert unhash(C3P0.get_view(2)) == "XdXd"


 def test_length():
    t = lookup('the')
-    assert length_of(t) == 3
-    #t = lookup('')
-    #assert length_of(t) == 0
+    assert t.length == 3
    t = lookup("n't")
-    assert length_of(t) == 3
+    assert t.length == 3
    t = lookup("'s")
-    assert length_of(t) == 2
+    assert t.length == 2
    t = lookup('Xxxx')
-    assert length_of(t) == 4
+    assert t.length == 4
--- a/tests/test_post_punct.py
+++ b/tests/test_post_punct.py
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals

-from spacy import lex_of
 from spacy.en import lookup
 from spacy.en import tokenize
 from spacy.en import unhash
@@ -19,8 +18,8 @@ def test_close(close_puncts):
        string = word_str + p
        tokens = tokenize(string)
        assert len(tokens) == 2
-        assert unhash(lex_of(tokens[1])) == p
-        assert unhash(lex_of(tokens[0])) == word_str
+        assert unhash(tokens[1].lex) == p
+        assert unhash(tokens[0].lex) == word_str


 def test_two_different_close(close_puncts):
@@ -29,9 +28,9 @@ def test_two_different_close(close_puncts):
        string = word_str + p + "'"
        tokens = tokenize(string)
        assert len(tokens) == 3
-        assert unhash(lex_of(tokens[0])) == word_str
-        assert unhash(lex_of(tokens[1])) == p
-        assert unhash(lex_of(tokens[2])) == "'"
+        assert unhash(tokens[0].lex) == word_str
+        assert unhash(tokens[1].lex) == p
+        assert unhash(tokens[2].lex) == "'"


 def test_three_same_close(close_puncts):
@@ -40,5 +39,5 @@ def test_three_same_close(close_puncts):
        string = word_str + p + p + p
        tokens = tokenize(string)
        assert len(tokens) == 4
-        assert unhash(lex_of(tokens[0])) == word_str
-        assert unhash(lex_of(tokens[1])) == p
+        assert unhash(tokens[0].lex) == word_str
+        assert unhash(tokens[1].lex) == p
--- a/tests/test_pre_punct.py
+++ b/tests/test_pre_punct.py
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals

-from spacy import lex_of
 from spacy.en import lookup
 from spacy.en import tokenize
 from spacy.en import unhash
@@ -19,8 +18,8 @@ def test_open(open_puncts):
        string = p + word_str
        tokens = tokenize(string)
        assert len(tokens) == 2
-        assert unhash(lex_of(tokens[0])) == p
-        assert unhash(lex_of(tokens[1])) == word_str
+        assert unhash(tokens[0].lex) == p
+        assert unhash(tokens[1].lex) == word_str


 def test_two_different_open(open_puncts):
@@ -29,9 +28,9 @@ def test_two_different_open(open_puncts):
        string = p + "`" + word_str
        tokens = tokenize(string)
        assert len(tokens) == 3
-        assert unhash(lex_of(tokens[0])) == p
-        assert unhash(lex_of(tokens[1])) == "`"
-        assert unhash(lex_of(tokens[2])) == word_str
+        assert unhash(tokens[0].lex) == p
+        assert unhash(tokens[1].lex) == "`"
+        assert unhash(tokens[2].lex) == word_str


 def test_three_same_open(open_puncts):
@@ -40,12 +39,12 @@ def test_three_same_open(open_puncts):
        string = p + p + p + word_str
        tokens = tokenize(string)
        assert len(tokens) == 4
-        assert unhash(lex_of(tokens[0])) == p
-        assert unhash(lex_of(tokens[3])) == word_str
+        assert unhash(tokens[0].lex) == p
+        assert unhash(tokens[3].lex) == word_str


 def test_open_appostrophe():
    string = "'The"
    tokens = tokenize(string)
    assert len(tokens) == 2
-    assert unhash(lex_of(tokens[0])) == "'"
+    assert unhash(tokens[0].lex) == "'"
--- a/tests/test_ptb_match_wiki_sun.py
+++ b/tests/test_ptb_match_wiki_sun.py
@@ -1,46 +0,0 @@
-from __future__ import unicode_literals
-
-from spacy.en import unhash
-from spacy import lex_of
-from spacy.util import utf8open
-from spacy.ptb3 import tokenize, lookup, unhash
-
-import pytest
-import os
-from os import path
-
-
-HERE = path.dirname(__file__)
-
-
-@pytest.fixture
-def sun_txt():
-    loc = path.join(HERE, 'sun.txt')
-    return utf8open(loc).read()
-
-
-@pytest.fixture
-def my_tokens(sun_txt):
-    assert len(sun_txt) != 0
-    tokens = tokenize(sun_txt)
-    return [unhash(lex_of(t)) for t in tokens]
-
-
-@pytest.fixture
-def sed_tokens():
-    loc = path.join(HERE, 'sun.tokens')
-    return utf8open(loc).read().split()
-
-
-def test_compare_tokens(my_tokens, sed_tokens):
-    me = my_tokens
-    sed = sed_tokens
-    i = 0
-    while i < len(me) and i < len(sed):
-        assert me[i] == sed[i]
-        i += 1
-
-    assert len(me) == len(sed)
-
-
-
--- a/tests/test_surround_punct.py
+++ b/tests/test_surround_punct.py
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals

-from spacy import lex_of
 from spacy.en import tokenize
 from spacy.en import lookup
 from spacy.en import unhash
@@ -19,9 +18,9 @@ def test_token(paired_puncts):
        string = open_ + word_str + close_
        tokens = tokenize(string)
        assert len(tokens) == 3
-        assert unhash(lex_of(tokens[0])) == open_
-        assert unhash(lex_of(tokens[1])) == word_str
-        assert unhash(lex_of(tokens[2])) == close_
+        assert unhash(tokens[0].lex) == open_
+        assert unhash(tokens[1].lex) == word_str
+        assert unhash(tokens[2].lex) == close_


 def test_two_different(paired_puncts):
@@ -30,9 +29,9 @@ def test_two_different(paired_puncts):
        string = "`" + open_ + word_str + close_ + "'"
        tokens = tokenize(string)
        assert len(tokens) == 5
-        assert unhash(lex_of(tokens[0])) == "`"
-        assert unhash(lex_of(tokens[1])) == open_
-        assert unhash(lex_of(tokens[2])) == word_str
-        assert unhash(lex_of(tokens[2])) == word_str
-        assert unhash(lex_of(tokens[3])) == close_
-        assert unhash(lex_of(tokens[4])) == "'"
+        assert unhash(tokens[0].lex) == "`"
+        assert unhash(tokens[1].lex) == open_
+        assert unhash(tokens[2].lex) == word_str
+        assert unhash(tokens[2].lex) == word_str
+        assert unhash(tokens[3].lex) == close_
+        assert unhash(tokens[4].lex) == "'"
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -3,8 +3,6 @@ from __future__ import unicode_literals
 from spacy.en import tokenize
 from spacy.en import lookup

-from spacy.lexeme import lex_of
-

 def test_single_word():
    lex_ids = tokenize(u'hello')
@@ -12,33 +10,33 @@ def test_single_word():


 def test_two_words():
-    lex_ids = tokenize(u'hello possums')
-    assert len(lex_ids) == 2
-    assert lex_ids[0] == lookup(u'hello')
-    assert lex_ids[0] != lex_ids[1]
+    words = tokenize('hello possums')
+    assert len(words) == 2
+    assert words[0] == lookup('hello')
+    assert words[0] != words[1]


 def test_punct():
    tokens = tokenize('hello, possums.')
    assert len(tokens) == 4
-    assert lex_of(tokens[0]) == lex_of(lookup('hello'))
-    assert lex_of(tokens[1]) == lex_of(lookup(','))
-    assert lex_of(tokens[2]) == lex_of(lookup('possums'))
-    assert lex_of(tokens[1]) != lex_of(lookup('hello'))
+    assert tokens[0].lex == lookup('hello').lex
+    assert tokens[1].lex == lookup(',').lex
+    assert tokens[2].lex == lookup('possums').lex
+    assert tokens[1].lex != lookup('hello').lex


 def test_digits():
    lex_ids = tokenize('The year: 1984.')
    assert len(lex_ids) == 5
-    assert lex_of(lex_ids[0]) == lex_of(lookup('The'))
-    assert lex_of(lex_ids[3]) == lex_of(lookup('1984'))
-    assert lex_of(lex_ids[4]) == lex_of(lookup('.'))
+    assert lex_ids[0].lex == lookup('The').lex
+    assert lex_ids[3].lex == lookup('1984').lex
+    assert lex_ids[4].lex == lookup('.').lex


 def test_contraction():
    lex_ids = tokenize("don't giggle")
    assert len(lex_ids) == 3
-    assert lex_of(lex_ids[1]) == lex_of(lookup("not"))
+    assert lex_ids[1].lex == lookup("not").lex
    lex_ids = tokenize("i said don't!")
    assert len(lex_ids) == 4
-    assert lex_of(lex_ids[3]) == lex_of(lookup('!'))
+    assert lex_ids[3].lex == lookup('!').lex
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -17,7 +17,7 @@ def test_eq():

 def test_round_trip():
    hello = lookup('Hello')
-    assert unhash(lex_of(hello)) == 'Hello'
+    assert unhash(hello.lex) == 'Hello'


 def test_case_neq():
@@ -32,6 +32,6 @@ def test_punct_neq():

 def test_short():
    addr = lookup('I')
-    assert unhash(lex_of(addr)) == 'I'
+    assert unhash(addr.lex) == 'I'
    addr = lookup('not')
-    assert unhash(lex_of(addr)) == 'not'
+    assert unhash(addr.lex) == 'not'