* Tests passing for new Word object version

Matthew Honnibal 2014-08-24 18:13:53 +02:00
parent 9815c7649e
commit 3b793cf4f7
8 changed files with 6 additions and 463066 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,104 +0,0 @@
-# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
-# 21:09, 25 June 2014
-#*--* --
-#*---* ---
-#*'s 's
-cannot can not
-d'ye d' ye
-gimme gim me
-gonna gon na
-lemme lem me
-more'n more 'n
-'tis 't is
-'twas 't was
-wanna wan na
-whaddya wha dd ya
-whatcha wha t cha
-ain't ai n't
-aren't are n't
-can't can n't
-could've could 've
-couldn't could n't
-couldn't've could n't 've
-didn't did n't
-doesn't does n't
-don't do n't
-hadn't had n't
-hadn't've had n't 've
-hasn't has n't
-haven't have n't
-he'd he 'd
-he'd've he 'd 've
-he'll he 'll
-he's he 's
-how'd he 'd
-how'll he 'll
-how's how 's
-I'd I 'd
-I'd've I 'd 've
-I'll I 'll
-I'm I 'm
-I've I 've
-isn't is n't
-it'd it 'd
-it'd've it 'd 've
-it'll it 'll
-it's it 's
-let's let 's
-mightn't might n't
-mightn't've might n't 've
-might've might 've
-mustn't must n't
-must've must 've
-needn't need n't
-not've not h've
-shan't sha n't
-she'd she 'd
-she'd've she 'd 've
-she'll she 'll
-she's she 's
-should've should 've
-shouldn't should n't
-shouldn't've should n't 've
-that's that 's
-there'd there 'd
-there'd've there 'd 've
-there's there 's
-they'd there 'd
-they'd've they 'd 've
-they'll they 'll
-they're they 're
-they've they 've
-wasn't was n't
-we'd we 'd
-we'd've we 'd h've
-we'll we 'll
-we're we 're
-we've we h've
-weren't were n't
-what'll what 'll
-what're what 're
-what's what 's
-what've what 've
-when's when 's
-where'd where 'd
-where's where 's
-where've where 've
-who'd who 'd
-who'll who 'll
-who're who 're
-who's who 's
-who've who 've
-why'll why 'll
-why're why 're
-why's why 's
-won't will n't
-would've would 've
-wouldn't would n't
-wouldn't've would n't 've
-you'd you 'd
-you'd've you 'd 've
-you'll you 'll
-you're you 're
-you've you 've

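The deleted file is a plain whitespace-separated table: the first field on each line is a surface form, and the remaining fields are the tokens it should be split into (e.g. "don't" becomes "do" + "n't", and "won't" becomes "will" + "n't"). A minimal sketch of how such a table could be loaded and applied during tokenization; the function names and the dict-based lookup are illustrative assumptions, not spaCy's actual loader:

    # Hypothetical reader for the table above; spaCy's real loader is
    # not shown in this diff.
    def load_contractions(path):
        """Parse lines like "don't do n't" into {"don't": ["do", "n't"]}."""
        table = {}
        with open(path, encoding='utf8') as f:
            for line in f:
                fields = line.split()
                if not fields or fields[0].startswith('#'):
                    continue  # skip blank lines and the header comments
                table[fields[0]] = fields[1:]
        return table

    def expand(token, table):
        """Return the token's expansion, or the token itself if unknown."""
        return table.get(token, [token])

For example, expand("won't", table) would give ["will", "n't"], matching the "won't will n't" entry above.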

@@ -2,103 +2,4 @@
 # 21:09, 25 June 2014
 #*--* --
 #*---* ---
-#*'s 's
-cannot can not
-d'ye d' ye
-gimme gim me
-gonna gon na
-lemme lem me
-more'n more 'n
-'tis 't is
-'twas 't was
-wanna wan na
-whaddya wha dd ya
-whatcha wha t cha
-ain't ai n't
-aren't are n't
-can't can n't
-could've could 've
-couldn't could n't
-couldn't've could n't 've
-didn't did n't
-doesn't does n't
-don't do n't
-hadn't had n't
-hadn't've had n't 've
-hasn't has n't
-haven't have n't
-he'd he 'd
-he'd've he 'd 've
-he'll he 'll
-he's he 's
-how'd he 'd
-how'll he 'll
-how's how 's
-I'd I 'd
-I'd've I 'd 've
-I'll I 'll
-I'm I 'm
-I've I 've
-isn't is n't
-it'd it 'd
-it'd've it 'd 've
-it'll it 'll
-it's it 's
-let's let 's
-mightn't might n't
-mightn't've might n't 've
-might've might 've
-mustn't must n't
-must've must 've
-needn't need n't
-not've not h've
-shan't sha n't
-she'd she 'd
-she'd've she 'd 've
-she'll she 'll
-she's she 's
-should've should 've
-shouldn't should n't
-shouldn't've should n't 've
-that's that 's
-there'd there 'd
-there'd've there 'd 've
-there's there 's
-they'd there 'd
-they'd've they 'd 've
-they'll they 'll
-they're they 're
-they've they 've
-wasn't was n't
-we'd we 'd
-we'd've we 'd h've
-we'll we 'll
-we're we 're
-we've we h've
-weren't were n't
-what'll what 'll
-what're what 're
-what's what 's
-what've what 've
-when's when 's
-where'd where 'd
-where's where 's
-where've where 've
-who'd who 'd
-who'll who 'll
-who're who 're
-who's who 's
-who've who 've
-why'll why 'll
-why're why 're
-why's why 's
-won't will n't
-would've would 've
-wouldn't would n't
-wouldn't've would n't 've
-you'd you 'd
-you'd've you 'd 've
-you'll you 'll
-you're you 're
-you've you 've


@@ -11,8 +11,7 @@ spaCy NLP Tokenizer and Lexicon
    guide/overview
    guide/install
-   api/languages/index.rst
-   api/modules/index.rst
+   api/index.rst
 Source (GitHub)
 ----------------

fabfile.py

@@ -9,7 +9,6 @@ def clean():
 def docs():
     with lcd('docs'):
         local('sphinx-build -b html . ./_build')
-        local('open _build/index.html')
 
 def test():
     local('py.test -x')


@@ -45,16 +45,13 @@ else:
 exts = [
-    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++",
               include_dirs=includes),
     Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
-              include_dirs=includes),
-    Extension("spacy.orthography.latin", ["spacy/orthography/latin.pyx"], language="c++",
-              include_dirs=includes)
+    Extension("spacy.word", ["spacy/word.pyx"], language="c++",
+              include_dirs=includes),
 ]
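The entries above are ordinary distutils/Cython Extension objects, one per .pyx module, each compiled as C++. A minimal sketch of the setup() call such a list would feed into; the cythonize-based build shown here is an assumption, since the surrounding setup.py is not part of this hunk:

    from setuptools import setup, Extension
    from Cython.Build import cythonize

    # One entry in the same shape as the list in the diff above.
    exts = [
        Extension("spacy.word", ["spacy/word.pyx"], language="c++"),
    ]

    # cythonize() translates each .pyx to C++ before building the modules.
    setup(name='spacy', ext_modules=cythonize(exts))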


@@ -5,17 +5,17 @@ scheme in several important respects:
 * Whitespace is added as tokens, except for single spaces. e.g.,
-    >>> tokenize(u'\\nHello \\tThere').strings
+    >>> [w.string for w in tokenize(u'\\nHello \\tThere')]
     [u'\\n', u'Hello', u' ', u'\\t', u'There']
 * Contractions are normalized, e.g.
-    >>> tokenize(u"isn't ain't won't he's").strings
+    >>> [w.string for w in tokenize(u"isn't ain't won't he's")]
     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
 * Hyphenated words are split, with the hyphen preserved, e.g.:
-    >>> tokenize(u'New York-based').strings
+    >>> [w.string for w in tokenize(u'New York-based')]
     [u'New', u'York', u'-', u'based']
 Other improvements:
@@ -39,25 +39,11 @@ from __future__ import unicode_literals
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
-from libcpp.vector cimport vector
 cimport spacy
-from spacy.orthography.latin cimport *
-from .orthography.latin import *
-from .lexeme import *
 cdef class English(spacy.Language):
-    # How to ensure the order here aligns with orthography.latin?
-    view_funcs = [
-        get_normalized,
-        get_word_shape,
-        get_last3
-    ]
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
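Taken together, the docstring changes above show the usage pattern for the new Word object version: tokenize() no longer exposes a .strings attribute on its result, so callers iterate the tokens and read each Word's .string. A small before/after sketch, assuming a module-level tokenize() as in the doctests (the import path is a guess at the 2014-era layout):

    # Hypothetical usage; `tokenize` and `Word.string` follow the doctest
    # lines in the diff, while the import path is assumed.
    from spacy.en import tokenize

    tokens = tokenize(u'New York-based')

    # Old API (removed in this commit):
    #     strings = tokens.strings
    # New Word-object API:
    strings = [w.string for w in tokens]
    assert strings == [u'New', u'York', u'-', u'based']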