* Tests passing for new Word object version

2025-12-23 10:03:15 +03:00 · 2014-08-24 18:13:53 +02:00 · 2014-08-24 18:13:53 +02:00 · 3b793cf4f7
commit 3b793cf4f7
parent 9815c7649e
8 changed files with 6 additions and 463066 deletions
--- a/data/en_ptb/case
+++ b/data/en_ptb/case
--- a/data/en_ptb/clusters
+++ b/data/en_ptb/clusters
--- a/data/en_ptb/tokenization
+++ b/data/en_ptb/tokenization
@ -1,104 +0,0 @@
-# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
-#  21:09, 25 June 2014
-#*--*  --
-#*---* ---
-#*'s  's
-
-cannot  can not
-d'ye    d' ye
-gimme   gim me
-gonna   gon na
-lemme   lem me
-more'n  more 'n
-'tis    't is
-'twas   't was
-wanna   wan na
-whaddya wha dd ya
-whatcha wha t cha
-ain't   ai n't
-aren't  are n't
-can't   can n't
-could've    could 've
-couldn't    could n't
-couldn't've could n't 've
-didn't  did n't
-doesn't does n't
-don't   do n't
-hadn't  had n't
-hadn't've   had n't 've
-hasn't  has n't
-haven't have n't
-he'd    he 'd
-he'd've he 'd 've
-he'll   he 'll
-he's    he 's
-how'd   he 'd
-how'll  he 'll
-how's   how 's
-I'd I 'd
-I'd've  I 'd 've
-I'll    I 'll
-I'm I 'm
-I've    I 've
-isn't   is n't
-it'd    it 'd
-it'd've it 'd 've
-it'll   it 'll
-it's    it 's
-let's   let 's
-mightn't    might n't
-mightn't've might n't 've
-might've    might 've
-mustn't must n't
-must've must 've
-needn't need n't
-not've  not h've
-shan't  sha n't
-she'd   she 'd
-she'd've    she 'd 've
-she'll  she 'll
-she's   she 's
-should've   should 've
-shouldn't   should n't
-shouldn't've    should n't 've
-that's  that 's
-there'd there 'd
-there'd've  there 'd 've
-there's there 's
-they'd  there 'd
-they'd've   they 'd 've
-they'll they 'll
-they're they 're
-they've they 've
-wasn't  was n't
-we'd    we 'd
-we'd've we 'd h've
-we'll   we 'll
-we're   we 're
-we've   we h've
-weren't were n't
-what'll what 'll
-what're what 're
-what's  what 's
-what've what 've
-when's  when 's
-where'd where 'd
-where's where 's
-where've    where 've
-who'd   who 'd
-who'll  who 'll
-who're  who 're
-who's   who 's
-who've  who 've
-why'll  why 'll
-why're  why 're
-why's   why 's
-won't   will n't
-would've    would 've
-wouldn't    would n't
-wouldn't've would n't 've
-you'd   you 'd
-you'd've    you 'd 've
-you'll  you 'll
-you're  you 're
-you've  you 've
--- a/data/ptb3/tokenization
+++ b/data/ptb3/tokenization
@ -2,103 +2,4 @@
 #  21:09, 25 June 2014
 #*--*  --
 #*---* ---
-#*'s  's

-cannot  can not
-d'ye    d' ye
-gimme   gim me
-gonna   gon na
-lemme   lem me
-more'n  more 'n
-'tis    't is
-'twas   't was
-wanna   wan na
-whaddya wha dd ya
-whatcha wha t cha
-ain't   ai n't
-aren't  are n't
-can't   can n't
-could've    could 've
-couldn't    could n't
-couldn't've could n't 've
-didn't  did n't
-doesn't does n't
-don't   do n't
-hadn't  had n't
-hadn't've   had n't 've
-hasn't  has n't
-haven't have n't
-he'd    he 'd
-he'd've he 'd 've
-he'll   he 'll
-he's    he 's
-how'd   he 'd
-how'll  he 'll
-how's   how 's
-I'd I 'd
-I'd've  I 'd 've
-I'll    I 'll
-I'm I 'm
-I've    I 've
-isn't   is n't
-it'd    it 'd
-it'd've it 'd 've
-it'll   it 'll
-it's    it 's
-let's   let 's
-mightn't    might n't
-mightn't've might n't 've
-might've    might 've
-mustn't must n't
-must've must 've
-needn't need n't
-not've  not h've
-shan't  sha n't
-she'd   she 'd
-she'd've    she 'd 've
-she'll  she 'll
-she's   she 's
-should've   should 've
-shouldn't   should n't
-shouldn't've    should n't 've
-that's  that 's
-there'd there 'd
-there'd've  there 'd 've
-there's there 's
-they'd  there 'd
-they'd've   they 'd 've
-they'll they 'll
-they're they 're
-they've they 've
-wasn't  was n't
-we'd    we 'd
-we'd've we 'd h've
-we'll   we 'll
-we're   we 're
-we've   we h've
-weren't were n't
-what'll what 'll
-what're what 're
-what's  what 's
-what've what 've
-when's  when 's
-where'd where 'd
-where's where 's
-where've    where 've
-who'd   who 'd
-who'll  who 'll
-who're  who 're
-who's   who 's
-who've  who 've
-why'll  why 'll
-why're  why 're
-why's   why 's
-won't   will n't
-would've    would 've
-wouldn't    would n't
-wouldn't've would n't 've
-you'd   you 'd
-you'd've    you 'd 've
-you'll  you 'll
-you're  you 're
-you've  you 've
--- a/docs/index.rst
+++ b/docs/index.rst
@ -11,8 +11,7 @@ spaCy NLP Tokenizer and Lexicon
    
    guide/overview
    guide/install
-    api/languages/index.rst
-    api/modules/index.rst
+    api/index.rst

 Source (GitHub)
 ----------------
--- a/fabfile.py
+++ b/fabfile.py
@ -9,7 +9,6 @@ def clean():
 def docs():
    with lcd('docs'):
        local('sphinx-build -b html . ./_build')
-        local('open _build/index.html')

 def test():
    local('py.test -x')
--- a/setup.py
+++ b/setup.py
@ -45,16 +45,13 @@ else:


 exts = [
-    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.en", ["spacy/en.pyx"], language="c++",
              include_dirs=includes),
    Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
+    Extension("spacy.word", ["spacy/word.pyx"], language="c++",
              include_dirs=includes),
-    Extension("spacy.orthography.latin", ["spacy/orthography/latin.pyx"], language="c++",
-              include_dirs=includes)
 ]


--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@ -5,17 +5,17 @@ scheme in several important respects:

 * Whitespace is added as tokens, except for single spaces. e.g.,

-    >>> tokenize(u'\\nHello  \\tThere').strings
+    >>> [w.string for w in tokenize(u'\\nHello  \\tThere')]
    [u'\\n', u'Hello', u' ', u'\\t', u'There']

 * Contractions are normalized, e.g.

-    >>> tokenize(u"isn't ain't won't he's").strings
+    >>> [w.string for w in tokenize(u"isn't ain't won't he's")]
    [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
  
 * Hyphenated words are split, with the hyphen preserved, e.g.:
    
-    >>> tokenize(u'New York-based').strings
+    >>> [w.string for w in tokenize(u'New York-based')]
    [u'New', u'York', u'-', u'based']

 Other improvements:
@ -39,25 +39,11 @@ from __future__ import unicode_literals

 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
-from libcpp.vector cimport vector

 cimport spacy


-from spacy.orthography.latin cimport *
-
-from .orthography.latin import *
-from .lexeme import *
-
-
 cdef class English(spacy.Language):
-    # How to ensure the order here aligns with orthography.latin?
-    view_funcs = [
-        get_normalized,
-        get_word_shape,
-        get_last3
-    ]
-
    cdef int find_split(self, unicode word):
        cdef size_t length = len(word)
        cdef int i = 0