mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
* Tests passing for new Word object version
This commit is contained in:
parent
9815c7649e
commit
3b793cf4f7
146129
data/en_ptb/case
146129
data/en_ptb/case
File diff suppressed because it is too large
Load Diff
316709
data/en_ptb/clusters
316709
data/en_ptb/clusters
File diff suppressed because it is too large
Load Diff
|
@ -1,104 +0,0 @@
|
|||
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
|
||||
# 21:09, 25 June 2014
|
||||
#*--* --
|
||||
#*---* ---
|
||||
#*'s 's
|
||||
|
||||
cannot can not
|
||||
d'ye d' ye
|
||||
gimme gim me
|
||||
gonna gon na
|
||||
lemme lem me
|
||||
more'n more 'n
|
||||
'tis 't is
|
||||
'twas 't was
|
||||
wanna wan na
|
||||
whaddya wha dd ya
|
||||
whatcha wha t cha
|
||||
ain't ai n't
|
||||
aren't are n't
|
||||
can't can n't
|
||||
could've could 've
|
||||
couldn't could n't
|
||||
couldn't've could n't 've
|
||||
didn't did n't
|
||||
doesn't does n't
|
||||
don't do n't
|
||||
hadn't had n't
|
||||
hadn't've had n't 've
|
||||
hasn't has n't
|
||||
haven't have n't
|
||||
he'd he 'd
|
||||
he'd've he 'd 've
|
||||
he'll he 'll
|
||||
he's he 's
|
||||
how'd how 'd
|
||||
how'll how 'll
|
||||
how's how 's
|
||||
I'd I 'd
|
||||
I'd've I 'd 've
|
||||
I'll I 'll
|
||||
I'm I 'm
|
||||
I've I 've
|
||||
isn't is n't
|
||||
it'd it 'd
|
||||
it'd've it 'd 've
|
||||
it'll it 'll
|
||||
it's it 's
|
||||
let's let 's
|
||||
mightn't might n't
|
||||
mightn't've might n't 've
|
||||
might've might 've
|
||||
mustn't must n't
|
||||
must've must 've
|
||||
needn't need n't
|
||||
not've not 've
|
||||
shan't sha n't
|
||||
she'd she 'd
|
||||
she'd've she 'd 've
|
||||
she'll she 'll
|
||||
she's she 's
|
||||
should've should 've
|
||||
shouldn't should n't
|
||||
shouldn't've should n't 've
|
||||
that's that 's
|
||||
there'd there 'd
|
||||
there'd've there 'd 've
|
||||
there's there 's
|
||||
they'd they 'd
|
||||
they'd've they 'd 've
|
||||
they'll they 'll
|
||||
they're they 're
|
||||
they've they 've
|
||||
wasn't was n't
|
||||
we'd we 'd
|
||||
we'd've we 'd 've
|
||||
we'll we 'll
|
||||
we're we 're
|
||||
we've we 've
|
||||
weren't were n't
|
||||
what'll what 'll
|
||||
what're what 're
|
||||
what's what 's
|
||||
what've what 've
|
||||
when's when 's
|
||||
where'd where 'd
|
||||
where's where 's
|
||||
where've where 've
|
||||
who'd who 'd
|
||||
who'll who 'll
|
||||
who're who 're
|
||||
who's who 's
|
||||
who've who 've
|
||||
why'll why 'll
|
||||
why're why 're
|
||||
why's why 's
|
||||
won't will n't
|
||||
would've would 've
|
||||
wouldn't would n't
|
||||
wouldn't've would n't 've
|
||||
you'd you 'd
|
||||
you'd've you 'd 've
|
||||
you'll you 'll
|
||||
you're you 're
|
||||
you've you 've
|
|
@ -2,103 +2,4 @@
|
|||
# 21:09, 25 June 2014
|
||||
#*--* --
|
||||
#*---* ---
|
||||
#*'s 's
|
||||
|
||||
cannot can not
|
||||
d'ye d' ye
|
||||
gimme gim me
|
||||
gonna gon na
|
||||
lemme lem me
|
||||
more'n more 'n
|
||||
'tis 't is
|
||||
'twas 't was
|
||||
wanna wan na
|
||||
whaddya wha dd ya
|
||||
whatcha wha t cha
|
||||
ain't ai n't
|
||||
aren't are n't
|
||||
can't can n't
|
||||
could've could 've
|
||||
couldn't could n't
|
||||
couldn't've could n't 've
|
||||
didn't did n't
|
||||
doesn't does n't
|
||||
don't do n't
|
||||
hadn't had n't
|
||||
hadn't've had n't 've
|
||||
hasn't has n't
|
||||
haven't have n't
|
||||
he'd he 'd
|
||||
he'd've he 'd 've
|
||||
he'll he 'll
|
||||
he's he 's
|
||||
how'd how 'd
|
||||
how'll how 'll
|
||||
how's how 's
|
||||
I'd I 'd
|
||||
I'd've I 'd 've
|
||||
I'll I 'll
|
||||
I'm I 'm
|
||||
I've I 've
|
||||
isn't is n't
|
||||
it'd it 'd
|
||||
it'd've it 'd 've
|
||||
it'll it 'll
|
||||
it's it 's
|
||||
let's let 's
|
||||
mightn't might n't
|
||||
mightn't've might n't 've
|
||||
might've might 've
|
||||
mustn't must n't
|
||||
must've must 've
|
||||
needn't need n't
|
||||
not've not 've
|
||||
shan't sha n't
|
||||
she'd she 'd
|
||||
she'd've she 'd 've
|
||||
she'll she 'll
|
||||
she's she 's
|
||||
should've should 've
|
||||
shouldn't should n't
|
||||
shouldn't've should n't 've
|
||||
that's that 's
|
||||
there'd there 'd
|
||||
there'd've there 'd 've
|
||||
there's there 's
|
||||
they'd they 'd
|
||||
they'd've they 'd 've
|
||||
they'll they 'll
|
||||
they're they 're
|
||||
they've they 've
|
||||
wasn't was n't
|
||||
we'd we 'd
|
||||
we'd've we 'd 've
|
||||
we'll we 'll
|
||||
we're we 're
|
||||
we've we 've
|
||||
weren't were n't
|
||||
what'll what 'll
|
||||
what're what 're
|
||||
what's what 's
|
||||
what've what 've
|
||||
when's when 's
|
||||
where'd where 'd
|
||||
where's where 's
|
||||
where've where 've
|
||||
who'd who 'd
|
||||
who'll who 'll
|
||||
who're who 're
|
||||
who's who 's
|
||||
who've who 've
|
||||
why'll why 'll
|
||||
why're why 're
|
||||
why's why 's
|
||||
won't will n't
|
||||
would've would 've
|
||||
wouldn't would n't
|
||||
wouldn't've would n't 've
|
||||
you'd you 'd
|
||||
you'd've you 'd 've
|
||||
you'll you 'll
|
||||
you're you 're
|
||||
you've you 've
|
||||
|
|
|
@ -11,8 +11,7 @@ spaCy NLP Tokenizer and Lexicon
|
|||
|
||||
guide/overview
|
||||
guide/install
|
||||
api/languages/index.rst
|
||||
api/modules/index.rst
|
||||
api/index.rst
|
||||
|
||||
Source (GitHub)
|
||||
----------------
|
||||
|
|
1
fabfile.py
vendored
1
fabfile.py
vendored
|
@ -9,7 +9,6 @@ def clean():
|
|||
def docs():
|
||||
with lcd('docs'):
|
||||
local('sphinx-build -b html . ./_build')
|
||||
local('open _build/index.html')
|
||||
|
||||
def test():
|
||||
local('py.test -x')
|
||||
|
|
7
setup.py
7
setup.py
|
@ -45,16 +45,13 @@ else:
|
|||
|
||||
|
||||
exts = [
|
||||
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
|
||||
#Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.en", ["spacy/en.pyx"], language="c++",
|
||||
include_dirs=includes),
|
||||
Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
|
||||
Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
|
||||
Extension("spacy.word", ["spacy/word.pyx"], language="c++",
|
||||
include_dirs=includes),
|
||||
Extension("spacy.orthography.latin", ["spacy/orthography/latin.pyx"], language="c++",
|
||||
include_dirs=includes)
|
||||
]
|
||||
|
||||
|
||||
|
|
20
spacy/en.pyx
20
spacy/en.pyx
|
@ -5,17 +5,17 @@ scheme in several important respects:
|
|||
|
||||
* Whitespace is added as tokens, except for single spaces. e.g.,
|
||||
|
||||
>>> tokenize(u'\\nHello \\tThere').strings
|
||||
>>> [w.string for w in tokenize(u'\\nHello \\tThere')]
|
||||
[u'\\n', u'Hello', u' ', u'\\t', u'There']
|
||||
|
||||
* Contractions are normalized, e.g.
|
||||
|
||||
>>> tokenize(u"isn't ain't won't he's").strings
|
||||
>>> [w.string for w in tokenize(u"isn't ain't won't he's")]
|
||||
[u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
|
||||
|
||||
* Hyphenated words are split, with the hyphen preserved, e.g.:
|
||||
|
||||
>>> tokenize(u'New York-based').strings
|
||||
>>> [w.string for w in tokenize(u'New York-based')]
|
||||
[u'New', u'York', u'-', u'based']
|
||||
|
||||
Other improvements:
|
||||
|
@ -39,25 +39,11 @@ from __future__ import unicode_literals
|
|||
|
||||
from libc.stdlib cimport malloc, calloc, free
|
||||
from libc.stdint cimport uint64_t
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
cimport spacy
|
||||
|
||||
|
||||
from spacy.orthography.latin cimport *
|
||||
|
||||
from .orthography.latin import *
|
||||
from .lexeme import *
|
||||
|
||||
|
||||
cdef class English(spacy.Language):
|
||||
# How to ensure the order here aligns with orthography.latin?
|
||||
view_funcs = [
|
||||
get_normalized,
|
||||
get_word_shape,
|
||||
get_last3
|
||||
]
|
||||
|
||||
cdef int find_split(self, unicode word):
|
||||
cdef size_t length = len(word)
|
||||
cdef int i = 0
|
||||
|
|
Loading…
Reference in New Issue
Block a user