* Tests passing for new Word object version

Matthew Honnibal 2014-08-24 18:13:53 +02:00
parent 9815c7649e
commit 3b793cf4f7
8 changed files with 6 additions and 463066 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,104 +0,0 @@
-# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
-# 21:09, 25 June 2014
-#*--* --
-#*---* ---
-#*'s 's
-cannot can not
-d'ye d' ye
-gimme gim me
-gonna gon na
-lemme lem me
-more'n more 'n
-'tis 't is
-'twas 't was
-wanna wan na
-whaddya wha dd ya
-whatcha wha t cha
-ain't ai n't
-aren't are n't
-can't can n't
-could've could 've
-couldn't could n't
-couldn't've could n't 've
-didn't did n't
-doesn't does n't
-don't do n't
-hadn't had n't
-hadn't've had n't 've
-hasn't has n't
-haven't have n't
-he'd he 'd
-he'd've he 'd 've
-he'll he 'll
-he's he 's
-how'd he 'd
-how'll he 'll
-how's how 's
-I'd I 'd
-I'd've I 'd 've
-I'll I 'll
-I'm I 'm
-I've I 've
-isn't is n't
-it'd it 'd
-it'd've it 'd 've
-it'll it 'll
-it's it 's
-let's let 's
-mightn't might n't
-mightn't've might n't 've
-might've might 've
-mustn't must n't
-must've must 've
-needn't need n't
-not've not h've
-shan't sha n't
-she'd she 'd
-she'd've she 'd 've
-she'll she 'll
-she's she 's
-should've should 've
-shouldn't should n't
-shouldn't've should n't 've
-that's that 's
-there'd there 'd
-there'd've there 'd 've
-there's there 's
-they'd there 'd
-they'd've they 'd 've
-they'll they 'll
-they're they 're
-they've they 've
-wasn't was n't
-we'd we 'd
-we'd've we 'd h've
-we'll we 'll
-we're we 're
-we've we h've
-weren't were n't
-what'll what 'll
-what're what 're
-what's what 's
-what've what 've
-when's when 's
-where'd where 'd
-where's where 's
-where've where 've
-who'd who 'd
-who'll who 'll
-who're who 're
-who's who 's
-who've who 've
-why'll why 'll
-why're why 're
-why's why 's
-won't will n't
-would've would 've
-wouldn't would n't
-wouldn't've would n't 've
-you'd you 'd
-you'd've you 'd 've
-you'll you 'll
-you're you 're
-you've you 've

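The deleted file is a plain whitespace-separated table: the first field on each line is a surface form, and the remaining fields are the tokens it should be split into (e.g. "don't" becomes "do" + "n't", and "won't" becomes "will" + "n't"). A minimal sketch of how such a table could be loaded and applied during tokenization; the function names and the dict-based lookup are illustrative assumptions, not spaCy's actual loader:

    # Hypothetical reader for the table above; spaCy's real loader is
    # not shown in this diff.
    def load_contractions(path):
        """Parse lines like "don't do n't" into {"don't": ["do", "n't"]}."""
        table = {}
        with open(path, encoding='utf8') as f:
            for line in f:
                fields = line.split()
                if not fields or fields[0].startswith('#'):
                    continue  # skip blank lines and the header comments
                table[fields[0]] = fields[1:]
        return table

    def expand(token, table):
        """Return the token's expansion, or the token itself if unknown."""
        return table.get(token, [token])

For example, expand("won't", table) would give ["will", "n't"], matching the "won't will n't" entry above.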

@@ -2,103 +2,4 @@
 # 21:09, 25 June 2014
 #*--* --
 #*---* ---
-#*'s 's
-cannot can not
-d'ye d' ye
-gimme gim me
-gonna gon na
-lemme lem me
-more'n more 'n
-'tis 't is
-'twas 't was
-wanna wan na
-whaddya wha dd ya
-whatcha wha t cha
-ain't ai n't
-aren't are n't
-can't can n't
-could've could 've
-couldn't could n't
-couldn't've could n't 've
-didn't did n't
-doesn't does n't
-don't do n't
-hadn't had n't
-hadn't've had n't 've
-hasn't has n't
-haven't have n't
-he'd he 'd
-he'd've he 'd 've
-he'll he 'll
-he's he 's
-how'd he 'd
-how'll he 'll
-how's how 's
-I'd I 'd
-I'd've I 'd 've
-I'll I 'll
-I'm I 'm
-I've I 've
-isn't is n't
-it'd it 'd
-it'd've it 'd 've
-it'll it 'll
-it's it 's
-let's let 's
-mightn't might n't
-mightn't've might n't 've
-might've might 've
-mustn't must n't
-must've must 've
-needn't need n't
-not've not h've
-shan't sha n't
-she'd she 'd
-she'd've she 'd 've
-she'll she 'll
-she's she 's
-should've should 've
-shouldn't should n't
-shouldn't've should n't 've
-that's that 's
-there'd there 'd
-there'd've there 'd 've
-there's there 's
-they'd there 'd
-they'd've they 'd 've
-they'll they 'll
-they're they 're
-they've they 've
-wasn't was n't
-we'd we 'd
-we'd've we 'd h've
-we'll we 'll
-we're we 're
-we've we h've
-weren't were n't
-what'll what 'll
-what're what 're
-what's what 's
-what've what 've
-when's when 's
-where'd where 'd
-where's where 's
-where've where 've
-who'd who 'd
-who'll who 'll
-who're who 're
-who's who 's
-who've who 've
-why'll why 'll
-why're why 're
-why's why 's
-won't will n't
-would've would 've
-wouldn't would n't
-wouldn't've would n't 've
-you'd you 'd
-you'd've you 'd 've
-you'll you 'll
-you're you 're
-you've you 've


@@ -11,8 +11,7 @@ spaCy NLP Tokenizer and Lexicon
    guide/overview
    guide/install
-   api/languages/index.rst
-   api/modules/index.rst
+   api/index.rst
 Source (GitHub)
 ----------------

fabfile.py

@@ -9,7 +9,6 @@ def clean():
 def docs():
     with lcd('docs'):
         local('sphinx-build -b html . ./_build')
-        local('open _build/index.html')
 
 def test():
     local('py.test -x')


@@ -45,16 +45,13 @@ else:
 exts = [
-    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++",
               include_dirs=includes),
     Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
-              include_dirs=includes),
-    Extension("spacy.orthography.latin", ["spacy/orthography/latin.pyx"], language="c++",
-              include_dirs=includes)
+    Extension("spacy.word", ["spacy/word.pyx"], language="c++",
+              include_dirs=includes),
 ]
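The entries above are ordinary distutils/Cython Extension objects, one per .pyx module, each compiled as C++. A minimal sketch of the setup() call such a list would feed into; the cythonize-based build shown here is an assumption, since the surrounding setup.py is not part of this hunk:

    from setuptools import setup, Extension
    from Cython.Build import cythonize

    # One entry in the same shape as the list in the diff above.
    exts = [
        Extension("spacy.word", ["spacy/word.pyx"], language="c++"),
    ]

    # cythonize() translates each .pyx to C++ before building the modules.
    setup(name='spacy', ext_modules=cythonize(exts))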


@@ -5,17 +5,17 @@ scheme in several important respects:
 * Whitespace is added as tokens, except for single spaces. e.g.,
-    >>> tokenize(u'\\nHello \\tThere').strings
+    >>> [w.string for w in tokenize(u'\\nHello \\tThere')]
     [u'\\n', u'Hello', u' ', u'\\t', u'There']
 * Contractions are normalized, e.g.
-    >>> tokenize(u"isn't ain't won't he's").strings
+    >>> [w.string for w in tokenize(u"isn't ain't won't he's")]
     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
 * Hyphenated words are split, with the hyphen preserved, e.g.:
-    >>> tokenize(u'New York-based').strings
+    >>> [w.string for w in tokenize(u'New York-based')]
     [u'New', u'York', u'-', u'based']
 Other improvements:
@@ -39,25 +39,11 @@ from __future__ import unicode_literals
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
-from libcpp.vector cimport vector
 cimport spacy
-from spacy.orthography.latin cimport *
-from .orthography.latin import *
-from .lexeme import *
 cdef class English(spacy.Language):
-    # How to ensure the order here aligns with orthography.latin?
-    view_funcs = [
-        get_normalized,
-        get_word_shape,
-        get_last3
-    ]
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
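Taken together, the docstring changes above show the usage pattern for the new Word object version: tokenize() no longer exposes a .strings attribute on its result, so callers iterate the tokens and read each Word's .string. A small before/after sketch, assuming a module-level tokenize() as in the doctests (the import path is a guess at the 2014-era layout):

    # Hypothetical usage; `tokenize` and `Word.string` follow the doctest
    # lines in the diff, while the import path is assumed.
    from spacy.en import tokenize

    tokens = tokenize(u'New York-based')

    # Old API (removed in this commit):
    #     strings = tokens.strings
    # New Word-object API:
    strings = [w.string for w in tokens]
    assert strings == [u'New', u'York', u'-', u'based']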