Mirror of https://github.com/explosion/spaCy.git (synced 2025-06-30 01:43:21 +03:00)

Commit 3b793cf4f7 (parent 9815c7649e)

* Tests passing for new Word object version
data/en_ptb/case: 146129 changes (file diff suppressed because it is too large)
data/en_ptb/clusters: 316709 changes (file diff suppressed because it is too large)
@@ -1,104 +0,0 @@
-# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
-# 21:09, 25 June 2014
-#*--* --
-#*---* ---
-#*'s 's
-
-cannot can not
-d'ye d' ye
-gimme gim me
-gonna gon na
-lemme lem me
-more'n more 'n
-'tis 't is
-'twas 't was
-wanna wan na
-whaddya wha dd ya
-whatcha wha t cha
-ain't ai n't
-aren't are n't
-can't can n't
-could've could 've
-couldn't could n't
-couldn't've could n't 've
-didn't did n't
-doesn't does n't
-don't do n't
-hadn't had n't
-hadn't've had n't 've
-hasn't has n't
-haven't have n't
-he'd he 'd
-he'd've he 'd 've
-he'll he 'll
-he's he 's
-how'd he 'd
-how'll he 'll
-how's how 's
-I'd I 'd
-I'd've I 'd 've
-I'll I 'll
-I'm I 'm
-I've I 've
-isn't is n't
-it'd it 'd
-it'd've it 'd 've
-it'll it 'll
-it's it 's
-let's let 's
-mightn't might n't
-mightn't've might n't 've
-might've might 've
-mustn't must n't
-must've must 've
-needn't need n't
-not've not h've
-shan't sha n't
-she'd she 'd
-she'd've she 'd 've
-she'll she 'll
-she's she 's
-should've should 've
-shouldn't should n't
-shouldn't've should n't 've
-that's that 's
-there'd there 'd
-there'd've there 'd 've
-there's there 's
-they'd there 'd
-they'd've they 'd 've
-they'll they 'll
-they're they 're
-they've they 've
-wasn't was n't
-we'd we 'd
-we'd've we 'd h've
-we'll we 'll
-we're we 're
-we've we h've
-weren't were n't
-what'll what 'll
-what're what 're
-what's what 's
-what've what 've
-when's when 's
-where'd where 'd
-where's where 's
-where've where 've
-who'd who 'd
-who'll who 'll
-who're who 're
-who's who 's
-who've who 've
-why'll why 'll
-why're why 're
-why's why 's
-won't will n't
-would've would 've
-wouldn't would n't
-wouldn't've would n't 've
-you'd you 'd
-you'd've you 'd 've
-you'll you 'll
-you're you 're
-you've you 've
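For context, each rule in the deleted file above maps a surface form to its whitespace-separated expansion, so "don't" is split into "do" and "n't" at tokenization time. Below is a minimal, hypothetical loader for that two-column format; load_special_cases is an illustrative name, not an actual spaCy function.

    # Hypothetical loader for the rule format above (assumption: not spaCy's actual code).
    # Each non-comment line is "<surface form> <expansion tokens...>"; '#' lines are comments.
    def load_special_cases(path):
        rules = {}
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                pieces = line.split()
                rules[pieces[0]] = pieces[1:]   # e.g. rules["don't"] == ["do", "n't"]
        return rules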
@@ -2,103 +2,4 @@
 # 21:09, 25 June 2014
 #*--* --
 #*---* ---
-#*'s 's

-cannot can not
-d'ye d' ye
-gimme gim me
-gonna gon na
-lemme lem me
-more'n more 'n
-'tis 't is
-'twas 't was
-wanna wan na
-whaddya wha dd ya
-whatcha wha t cha
-ain't ai n't
-aren't are n't
-can't can n't
-could've could 've
-couldn't could n't
-couldn't've could n't 've
-didn't did n't
-doesn't does n't
-don't do n't
-hadn't had n't
-hadn't've had n't 've
-hasn't has n't
-haven't have n't
-he'd he 'd
-he'd've he 'd 've
-he'll he 'll
-he's he 's
-how'd he 'd
-how'll he 'll
-how's how 's
-I'd I 'd
-I'd've I 'd 've
-I'll I 'll
-I'm I 'm
-I've I 've
-isn't is n't
-it'd it 'd
-it'd've it 'd 've
-it'll it 'll
-it's it 's
-let's let 's
-mightn't might n't
-mightn't've might n't 've
-might've might 've
-mustn't must n't
-must've must 've
-needn't need n't
-not've not h've
-shan't sha n't
-she'd she 'd
-she'd've she 'd 've
-she'll she 'll
-she's she 's
-should've should 've
-shouldn't should n't
-shouldn't've should n't 've
-that's that 's
-there'd there 'd
-there'd've there 'd 've
-there's there 's
-they'd there 'd
-they'd've they 'd 've
-they'll they 'll
-they're they 're
-they've they 've
-wasn't was n't
-we'd we 'd
-we'd've we 'd h've
-we'll we 'll
-we're we 're
-we've we h've
-weren't were n't
-what'll what 'll
-what're what 're
-what's what 's
-what've what 've
-when's when 's
-where'd where 'd
-where's where 's
-where've where 've
-who'd who 'd
-who'll who 'll
-who're who 're
-who's who 's
-who've who 've
-why'll why 'll
-why're why 're
-why's why 's
-won't will n't
-would've would 've
-wouldn't would n't
-wouldn't've would n't 've
-you'd you 'd
-you'd've you 'd 've
-you'll you 'll
-you're you 're
-you've you 've
@@ -11,8 +11,7 @@ spaCy NLP Tokenizer and Lexicon

 guide/overview
 guide/install
-api/languages/index.rst
-api/modules/index.rst
+api/index.rst

 Source (GitHub)
 ----------------
fabfile.py (vendored): 1 change
@@ -9,7 +9,6 @@ def clean():

 def docs():
     with lcd('docs'):
         local('sphinx-build -b html . ./_build')
-        local('open _build/index.html')


 def test():
     local('py.test -x')
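For context, these are Fabric 1.x tasks: local() runs a shell command and lcd() changes the working directory for the enclosed block. The removed line apparently opened the built HTML in the default viewer via the macOS open command. A rough standard-library equivalent of that one removed call, shown only to clarify what it did (an illustrative sketch, not repository code):

    # Approximate equivalent of the removed local('open _build/index.html')
    # (assumption: macOS `open` opens the file in the default application).
    import subprocess
    subprocess.call(['open', '_build/index.html'])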
setup.py: 7 changes
@@ -45,16 +45,13 @@ else:

 exts = [
-    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++",
               include_dirs=includes),
     Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
+    Extension("spacy.word", ["spacy/word.pyx"], language="c++",
               include_dirs=includes),
-    Extension("spacy.orthography.latin", ["spacy/orthography/latin.pyx"], language="c++",
-              include_dirs=includes)
 ]
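For orientation, a list of Cython Extension objects like exts is normally handed to setuptools' setup() as ext_modules, with Cython compiling the listed .pyx sources to C++. A minimal sketch under that assumption (not the repository's actual setup() call) follows:

    # Minimal sketch of how an `exts` list like the one in the hunk above is typically
    # consumed; an assumption for illustration, not this repository's actual setup.py tail.
    from setuptools import setup
    from Cython.Build import cythonize

    setup(
        name='spacy',
        packages=['spacy'],
        ext_modules=cythonize(exts),   # compiles the .pyx extensions declared above
    )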
spacy/en.pyx: 20 changes
@@ -5,17 +5,17 @@ scheme in several important respects:

 * Whitespace is added as tokens, except for single spaces. e.g.,

-    >>> tokenize(u'\\nHello \\tThere').strings
+    >>> [w.string for w in tokenize(u'\\nHello \\tThere')]
     [u'\\n', u'Hello', u' ', u'\\t', u'There']

 * Contractions are normalized, e.g.

-    >>> tokenize(u"isn't ain't won't he's").strings
+    >>> [w.string for w in u"isn't ain't won't he's")]
     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]

 * Hyphenated words are split, with the hyphen preserved, e.g.:

-    >>> tokenize(u'New York-based').strings
+    >>> [w.string for w in tokenize(u'New York-based')]
     [u'New', u'York', u'-', u'based']

 Other improvements:

@@ -39,25 +39,11 @@ from __future__ import unicode_literals

 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
-from libcpp.vector cimport vector

 cimport spacy

-from spacy.orthography.latin cimport *
-
-from .orthography.latin import *
-from .lexeme import *


 cdef class English(spacy.Language):
-    # How to ensure the order here aligns with orthography.latin?
-    view_funcs = [
-        get_normalized,
-        get_word_shape,
-        get_last3
-    ]
-
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
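The docstring changes above reflect the new Word object API this commit targets: tokenize() now yields Word objects, and callers take w.string instead of the removed .strings view. A small usage sketch derived purely from the doctests above (assuming spacy.en exposes a module-level tokenize(), as the doctests imply):

    # Usage sketch based on the doctests in spacy/en.pyx; the module-level
    # tokenize() import is an assumption taken from those doctests.
    from spacy.en import tokenize

    tokens = tokenize(u'New York-based')
    print([w.string for w in tokens])   # per the docstring: [u'New', u'York', u'-', u'based']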