* Tests passing for new Word object version

This commit is contained in:
Matthew Honnibal 2014-08-24 18:13:53 +02:00
parent 9815c7649e
commit 3b793cf4f7
8 changed files with 6 additions and 463066 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,104 +0,0 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's
cannot can not
d'ye d' ye
gimme gim me
gonna gon na
lemme lem me
more'n more 'n
'tis 't is
'twas 't was
wanna wan na
whaddya wha dd ya
whatcha wha t cha
ain't ai n't
aren't are n't
can't can n't
could've could 've
couldn't could n't
couldn't've could n't 've
didn't did n't
doesn't does n't
don't do n't
hadn't had n't
hadn't've had n't 've
hasn't has n't
haven't have n't
he'd he 'd
he'd've he 'd 've
he'll he 'll
he's he 's
how'd he 'd
how'll he 'll
how's how 's
I'd I 'd
I'd've I 'd 've
I'll I 'll
I'm I 'm
I've I 've
isn't is n't
it'd it 'd
it'd've it 'd 've
it'll it 'll
it's it 's
let's let 's
mightn't might n't
mightn't've might n't 've
might've might 've
mustn't must n't
must've must 've
needn't need n't
not've not h've
shan't sha n't
she'd she 'd
she'd've she 'd 've
she'll she 'll
she's she 's
should've should 've
shouldn't should n't
shouldn't've should n't 've
that's that 's
there'd there 'd
there'd've there 'd 've
there's there 's
they'd there 'd
they'd've they 'd 've
they'll they 'll
they're they 're
they've they 've
wasn't was n't
we'd we 'd
we'd've we 'd h've
we'll we 'll
we're we 're
we've we h've
weren't were n't
what'll what 'll
what're what 're
what's what 's
what've what 've
when's when 's
where'd where 'd
where's where 's
where've where 've
who'd who 'd
who'll who 'll
who're who 're
who's who 's
who've who 've
why'll why 'll
why're why 're
why's why 's
won't will n't
would've would 've
wouldn't would n't
wouldn't've would n't 've
you'd you 'd
you'd've you 'd 've
you'll you 'll
you're you 're
you've you 've
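
The file above is a plain special-cases table: the first column is the raw contraction, and the remaining columns are the tokens it should be split into. As a rough illustration of how such a table could be consumed (a hedged sketch, not spaCy's actual loader; the function names here are hypothetical):

    import io

    def load_special_cases(path):
        """Map each contraction to the list of tokens it splits into."""
        cases = {}
        with io.open(path, encoding='utf8') as rules:
            for line in rules:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue  # skip blank lines and commented-out rules
                pieces = line.split()
                cases[pieces[0]] = pieces[1:]  # e.g. "gonna" -> ['gon', 'na']
        return cases

    def split_token(word, cases):
        """Apply a special case if one exists; otherwise keep the word whole."""
        return cases.get(word, [word])

    # Example: split_token("gonna", cases) -> ['gon', 'na']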

View File

@@ -2,103 +2,4 @@
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's
cannot can not
d'ye d' ye
gimme gim me
gonna gon na
lemme lem me
more'n more 'n
'tis 't is
'twas 't was
wanna wan na
whaddya wha dd ya
whatcha wha t cha
ain't ai n't
aren't are n't
can't can n't
could've could 've
couldn't could n't
couldn't've could n't 've
didn't did n't
doesn't does n't
don't do n't
hadn't had n't
hadn't've had n't 've
hasn't has n't
haven't have n't
he'd he 'd
he'd've he 'd 've
he'll he 'll
he's he 's
how'd he 'd
how'll he 'll
how's how 's
I'd I 'd
I'd've I 'd 've
I'll I 'll
I'm I 'm
I've I 've
isn't is n't
it'd it 'd
it'd've it 'd 've
it'll it 'll
it's it 's
let's let 's
mightn't might n't
mightn't've might n't 've
might've might 've
mustn't must n't
must've must 've
needn't need n't
not've not h've
shan't sha n't
she'd she 'd
she'd've she 'd 've
she'll she 'll
she's she 's
should've should 've
shouldn't should n't
shouldn't've should n't 've
that's that 's
there'd there 'd
there'd've there 'd 've
there's there 's
they'd there 'd
they'd've they 'd 've
they'll they 'll
they're they 're
they've they 've
wasn't was n't
we'd we 'd
we'd've we 'd h've
we'll we 'll
we're we 're
we've we h've
weren't were n't
what'll what 'll
what're what 're
what's what 's
what've what 've
when's when 's
where'd where 'd
where's where 's
where've where 've
who'd who 'd
who'll who 'll
who're who 're
who's who 's
who've who 've
why'll why 'll
why're why 're
why's why 's
won't will n't
would've would 've
wouldn't would n't
wouldn't've would n't 've
you'd you 'd
you'd've you 'd 've
you'll you 'll
you're you 're
you've you 've

View File

@@ -11,8 +11,7 @@ spaCy NLP Tokenizer and Lexicon
guide/overview
guide/install
api/languages/index.rst
api/modules/index.rst
api/index.rst
Source (GitHub)
----------------

fabfile.py vendored
View File

@@ -9,7 +9,6 @@ def clean():
def docs():
with lcd('docs'):
local('sphinx-build -b html . ./_build')
local('open _build/index.html')
def test():
local('py.test -x')
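
For context (assumed, since the hunk omits the top of the file): these are Fabric 1.x tasks, where local() runs a shell command and lcd() changes the working directory for the enclosed block. A minimal standalone version of the remaining tasks might look like:

    from fabric.api import local, lcd

    def docs():
        # build the Sphinx docs; the `open _build/index.html` step was dropped above
        with lcd('docs'):
            local('sphinx-build -b html . ./_build')

    def test():
        local('py.test -x')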

View File

@@ -45,16 +45,13 @@ else:
exts = [
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
#Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
Extension("spacy.en", ["spacy/en.pyx"], language="c++",
include_dirs=includes),
Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
Extension("spacy.word", ["spacy/word.pyx"], language="c++",
include_dirs=includes),
Extension("spacy.orthography.latin", ["spacy/orthography/latin.pyx"], language="c++",
include_dirs=includes)
]
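
The exts list only declares the Cython modules; the rest of setup.py (not shown in this hunk) has to hand it to a build step that compiles the .pyx sources. A minimal sketch of that wiring, assuming the 2014-era Cython.Distutils build_ext command:

    from distutils.core import setup
    from distutils.extension import Extension
    from Cython.Distutils import build_ext

    includes = []  # header search paths; left empty in this sketch

    exts = [
        Extension("spacy.en", ["spacy/en.pyx"], language="c++",
                  include_dirs=includes),
        Extension("spacy.word", ["spacy/word.pyx"], language="c++",
                  include_dirs=includes),
    ]

    setup(
        name="spacy",
        cmdclass={"build_ext": build_ext},  # compiles .pyx -> C++ at build time
        ext_modules=exts,
    )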

View File

@@ -5,17 +5,17 @@ scheme in several important respects:
* Whitespace is added as tokens, except for single spaces. e.g.,
>>> tokenize(u'\\nHello \\tThere').strings
>>> [w.string for w in tokenize(u'\\nHello \\tThere')]
[u'\\n', u'Hello', u' ', u'\\t', u'There']
* Contractions are normalized, e.g.
>>> tokenize(u"isn't ain't won't he's").strings
>>> [w.string for w in tokenize(u"isn't ain't won't he's")]
[u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"'s"]
* Hyphenated words are split, with the hyphen preserved, e.g.:
>>> tokenize(u'New York-based').strings
>>> [w.string for w in tokenize(u'New York-based')]
[u'New', u'York', u'-', u'based']
Other improvements:
@@ -39,25 +39,11 @@ from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
cimport spacy
from spacy.orthography.latin cimport *
from .orthography.latin import *
from .lexeme import *
cdef class English(spacy.Language):
# How to ensure the order here aligns with orthography.latin?
view_funcs = [
get_normalized,
get_word_shape,
get_last3
]
cdef int find_split(self, unicode word):
cdef size_t length = len(word)
cdef int i = 0
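
The find_split body is cut off by the diff, so the following is only a hypothetical pure-Python sketch of what such a helper might compute: the length of the first sub-token of a chunk, with contractions left whole so the special-cases table above can handle them.

    def find_split(word):
        """Return the length of the first sub-token of `word`."""
        length = len(word)
        if length <= 1:
            return length
        # peel off a single leading punctuation mark, e.g. the bracket in "(Hello"
        if not word[0].isalnum():
            return 1
        # otherwise take the run of word characters, so "York-based" yields
        # "York", then "-", then "based" over repeated calls
        i = 0
        while i < length and (word[i].isalnum() or word[i] == "'"):
            i += 1
        return i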