Merge branch 'master' of ssh://github.com/explosion/spaCy

This commit is contained in:
Matthew Honnibal 2017-01-09 13:21:56 +01:00
commit 3eb6a929f3
84 changed files with 1867 additions and 3255 deletions

View File

@ -33,6 +33,7 @@ We use the following system to tag our issues:
| [`install`](https://github.com/explosion/spaCy/labels/install) | Installation problems | | [`install`](https://github.com/explosion/spaCy/labels/install) | Installation problems |
| [`performance`](https://github.com/explosion/spaCy/labels/performance) | Accuracy, speed and memory use problems | | [`performance`](https://github.com/explosion/spaCy/labels/performance) | Accuracy, speed and memory use problems |
| [`tests`](https://github.com/explosion/spaCy/labels/tests) | Missing or incorrect [tests](spacy/tests) | | [`tests`](https://github.com/explosion/spaCy/labels/tests) | Missing or incorrect [tests](spacy/tests) |
| [`english`](https://github.com/explosion/spaCy/labels/english), [`german`](https://github.com/explosion/spaCy/labels/german) | Issues related to the specific languages, models and data |
| [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems | | [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems |
| [`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers | | [`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers |
| [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before | | [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before |

View File

@ -3,8 +3,10 @@ spaCy: Industrial-strength NLP
spaCy is a library for advanced natural language processing in Python and spaCy is a library for advanced natural language processing in Python and
Cython. spaCy is built on the very latest research, but it isn't researchware. Cython. spaCy is built on the very latest research, but it isn't researchware.
It was designed from day 1 to be used in real products. It's commercial It was designed from day one to be used in real products. spaCy currently supports
open-source software, released under the MIT license. English and German, as well as tokenization for Chinese, Spanish, Italian, French,
Portuguese, Dutch, Swedish and Hungarian. It's commercial open-source software,
released under the MIT license.
💫 **Version 1.5 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_ 💫 **Version 1.5 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
@ -24,7 +26,7 @@ open-source software, released under the MIT license.
:target: https://pypi.python.org/pypi/spacy :target: https://pypi.python.org/pypi/spacy
:alt: pypi Version :alt: pypi Version
.. image:: https://badges.gitter.im/spaCy-users.png .. image:: https://badges.gitter.im/explosion.png
:target: https://gitter.im/explosion/spaCy :target: https://gitter.im/explosion/spaCy
:alt: spaCy on Gitter :alt: spaCy on Gitter

View File

@ -71,6 +71,8 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc):
features = get_templates('basic') features = get_templates('basic')
model_dir = pathlib.Path(model_dir) model_dir = pathlib.Path(model_dir)
if not (model_dir / 'deps').exists():
(model_dir / 'deps').mkdir()
with (model_dir / 'deps' / 'config.json').open('w') as file_: with (model_dir / 'deps' / 'config.json').open('w') as file_:
json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_) json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)

View File

@ -47,8 +47,7 @@ PACKAGES = [
'spacy.tests.tokenizer', 'spacy.tests.tokenizer',
'spacy.tests.tokens', 'spacy.tests.tokens',
'spacy.tests.vectors', 'spacy.tests.vectors',
'spacy.tests.vocab', 'spacy.tests.vocab']
'spacy.tests.website']
MOD_NAMES = [ MOD_NAMES = [

View File

@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
TAG_MAP = dict(TAG_MAP) TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))

View File

@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = {
ORTH_ONLY = [ ORTH_ONLY = [
"'",
"\\\")",
"<space>",
"a.",
"ä.",
"A.C.", "A.C.",
"a.D.", "a.D.",
"A.D.", "A.D.",
@ -530,24 +525,20 @@ ORTH_ONLY = [
"Abs.", "Abs.",
"adv.", "adv.",
"al.", "al.",
"b.",
"B.A.", "B.A.",
"B.Sc.", "B.Sc.",
"betr.", "betr.",
"biol.", "biol.",
"Biol.", "Biol.",
"c.",
"ca.", "ca.",
"Chr.", "Chr.",
"Cie.", "Cie.",
"co.", "co.",
"Co.", "Co.",
"d.",
"D.C.", "D.C.",
"Dipl.-Ing.", "Dipl.-Ing.",
"Dipl.", "Dipl.",
"Dr.", "Dr.",
"e.",
"e.g.", "e.g.",
"e.V.", "e.V.",
"ehem.", "ehem.",
@ -555,79 +546,57 @@ ORTH_ONLY = [
"erm.", "erm.",
"etc.", "etc.",
"ev.", "ev.",
"f.",
"g.",
"G.m.b.H.", "G.m.b.H.",
"geb.", "geb.",
"Gebr.", "Gebr.",
"gem.", "gem.",
"h.",
"h.c.", "h.c.",
"Hg.", "Hg.",
"hrsg.", "hrsg.",
"Hrsg.", "Hrsg.",
"i.",
"i.A.", "i.A.",
"i.e.", "i.e.",
"i.G.", "i.G.",
"i.Tr.", "i.Tr.",
"i.V.", "i.V.",
"Ing.", "Ing.",
"j.",
"jr.", "jr.",
"Jr.", "Jr.",
"jun.", "jun.",
"jur.", "jur.",
"k.",
"K.O.", "K.O.",
"l.",
"L.A.", "L.A.",
"lat.", "lat.",
"m.",
"M.A.", "M.A.",
"m.E.", "m.E.",
"m.M.", "m.M.",
"M.Sc.", "M.Sc.",
"Mr.", "Mr.",
"n.",
"N.Y.", "N.Y.",
"N.Y.C.", "N.Y.C.",
"nat.", "nat.",
"ö." "ö."
"o.",
"o.a.", "o.a.",
"o.ä.", "o.ä.",
"o.g.", "o.g.",
"o.k.", "o.k.",
"O.K.", "O.K.",
"p.",
"p.a.", "p.a.",
"p.s.", "p.s.",
"P.S.", "P.S.",
"pers.", "pers.",
"phil.", "phil.",
"q.",
"q.e.d.", "q.e.d.",
"r.",
"R.I.P.", "R.I.P.",
"rer.", "rer.",
"s.",
"sen.", "sen.",
"St.", "St.",
"std.", "std.",
"t.",
"u.",
"ü.",
"u.a.", "u.a.",
"U.S.", "U.S.",
"U.S.A.", "U.S.A.",
"U.S.S.", "U.S.S.",
"v.",
"Vol.", "Vol.",
"vs.", "vs.",
"w.", "wiss."
"wiss.",
"x.",
"y.",
"z."
] ]

View File

@ -37,14 +37,16 @@ def get_time_exc(hours):
return exc return exc
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
TAG_MAP = dict(TAG_MAP) TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "")) update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", ""))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"] __all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"]

File diff suppressed because it is too large Load Diff

View File

@ -40,11 +40,14 @@ def get_time_exc(hours):
return exc return exc
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = {
ORTH_ONLY = [ ORTH_ONLY = [
"a.",
"a.C.", "a.C.",
"a.J.C.", "a.J.C.",
"apdo.", "apdo.",
"Av.", "Av.",
"Avda.", "Avda.",
"b.",
"c.",
"Cía.", "Cía.",
"d.",
"e.",
"etc.", "etc.",
"f.",
"g.",
"Gob.", "Gob.",
"Gral.", "Gral.",
"h.",
"i.",
"Ing.", "Ing.",
"j.",
"J.C.", "J.C.",
"k.",
"l.",
"Lic.", "Lic.",
"m.",
"m.n.", "m.n.",
"n.",
"no.", "no.",
"núm.", "núm.",
"o.",
"p.",
"P.D.", "P.D.",
"Prof.", "Prof.",
"Profa.", "Profa.",
"q.",
"q.e.p.d." "q.e.p.d."
"r.",
"s.",
"S.A.", "S.A.",
"S.L.", "S.L.",
"s.s.s.", "s.s.s.",
"Sr.", "Sr.",
"Sra.", "Sra.",
"Srta.", "Srta."
"t.",
"u.",
"v.",
"w.",
"x.",
"y.",
"z."
] ]

View File

@ -2,13 +2,16 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .. import language_data as base from .. import language_data as base
from ..language_data import strings_to_exc from ..language_data import strings_to_exc, update_exc
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@ -4,21 +4,25 @@ from __future__ import unicode_literals
import six import six
from spacy.language_data import strings_to_exc, update_exc from spacy.language_data import strings_to_exc, update_exc
from .punctuations import * from .punctuation import *
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import ABBREVIATIONS from .tokenizer_exceptions import ABBREVIATIONS
from .tokenizer_exceptions import OTHER_EXC from .tokenizer_exceptions import OTHER_EXC
from .. import language_data as base from .. import language_data as base
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
TOKENIZER_INFIXES = TOKENIZER_INFIXES
# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES
TOKENIZER_INFIXES = TOKENIZER_INFIXES
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

25
spacy/hu/punctuation.py Normal file
View File

@ -0,0 +1,25 @@
# encoding: utf8
from __future__ import unicode_literals
from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES
# Suffix rule: split a trailing "-e" off a token when the character before the
# hyphen is a lowercase letter or a closing parenthesis.
TOKENIZER_SUFFIXES = [
    r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
]

# Infix rules: patterns on which a token is split in the middle. Every pattern
# that contains a "{...}" placeholder must be passed through .format(),
# otherwise the braces and the placeholder name end up as literal characters
# inside the character class.
TOKENIZER_INFIXES = [
    r'(?<=[0-9])-(?=[0-9])',
    r'(?<=[0-9])[+\-\*/^](?=[0-9])',
    # Fixed: this entry previously had no .format() call, so "[{a}]" matched
    # only the characters "{", "a" and "}" instead of any letter.
    r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
    r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
    r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
    r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA),
    r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
]

# Ellipsis patterns are shared with the language-independent defaults.
TOKENIZER_INFIXES += LIST_ELLIPSES

__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

View File

@ -1,89 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
TOKENIZER_PREFIXES = r'''
+
'''.strip().split('\n')
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
\$
>
:
;
'
«
_
''
\.\.
\.\.\.
\.\.\.\.
(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”+-])\.
(?<=[a-züóőúéáűí)])-e
\-\-
´
(?<=[0-9])\+
(?<=[a-z0-9üóőúéáűí][\)\]"'%\)§/])\.
(?<=[0-9])km²
(?<=[0-9])
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=°[FCK])\.
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = r'''
\.\.+
(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
(?<=[0-9])[+\-\*/^](?=[0-9])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
'''.strip().split('\n')
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

View File

@ -111,7 +111,6 @@ Vcs.
Vhr. Vhr.
X.Y. X.Y.
Zs. Zs.
a.
a.C. a.C.
ac. ac.
adj. adj.
@ -126,7 +125,6 @@ ang.
arch. arch.
at. at.
aug. aug.
b.
b.a. b.a.
b.s. b.s.
b.sc. b.sc.
@ -141,7 +139,6 @@ br.
bsc. bsc.
bt. bt.
btk. btk.
c.
ca. ca.
cc. cc.
cca. cca.
@ -155,7 +152,6 @@ csc.
csüt. csüt.
cső. cső.
ctv. ctv.
d.
dbj. dbj.
dd. dd.
ddr. ddr.
@ -170,7 +166,6 @@ dolg.
dr. dr.
du. du.
dzs. dzs.
e.
ea. ea.
ed. ed.
eff. eff.
@ -186,7 +181,6 @@ etc.
ev. ev.
ezr. ezr.
. .
f.
f.h. f.h.
f.é. f.é.
fam. fam.
@ -213,7 +207,6 @@ főig.
főisk. főisk.
főtörm. főtörm.
főv. főv.
g.
gazd. gazd.
gimn. gimn.
gk. gk.
@ -225,7 +218,6 @@ gy.
gyak. gyak.
gyártm. gyártm.
gör. gör.
h.
hads. hads.
hallg. hallg.
hdm. hdm.
@ -266,7 +258,6 @@ isk.
ism. ism.
izr. izr.
. .
j.
jan. jan.
jav. jav.
jegyz. jegyz.
@ -278,7 +269,6 @@ jr.
jvb. jvb.
júl. júl.
jún. jún.
k.
karb. karb.
kat. kat.
kb. kb.
@ -313,7 +303,6 @@ közl.
közp. közp.
közt. közt.
. .
l.
lat. lat.
ld. ld.
legs. legs.
@ -324,7 +313,6 @@ lt.
ltd. ltd.
ltp. ltp.
luth. luth.
m.
m.a. m.a.
m.s. m.s.
m.sc. m.sc.
@ -359,7 +347,6 @@ műh.
műsz. műsz.
műv. műv.
művez. művez.
n.
nagyker. nagyker.
nagys. nagys.
nat. nat.
@ -372,7 +359,6 @@ ny.
nyilv. nyilv.
nyrt. nyrt.
nyug. nyug.
o.
obj. obj.
okl. okl.
okt. okt.
@ -381,7 +367,6 @@ orsz.
ort. ort.
ov. ov.
ovh. ovh.
p.
pf. pf.
pg. pg.
ph.d ph.d
@ -404,8 +389,6 @@ pság.
ptk. ptk.
pu. pu.
. .
q.
r.
r.k. r.k.
rac. rac.
rad. rad.
@ -420,7 +403,6 @@ rkt.
rt. rt.
rtg. rtg.
röv. röv.
s.
s.b. s.b.
s.k. s.k.
sa. sa.
@ -450,7 +432,6 @@ szt.
szubj. szubj.
szöv. szöv.
szül. szül.
t.
tanm. tanm.
tb. tb.
tbk. tbk.
@ -476,13 +457,11 @@ tvr.
ty. ty.
törv. törv.
. .
u.
ua. ua.
ui. ui.
unit. unit.
uo. uo.
uv. uv.
v.
vas. vas.
vb. vb.
vegy. vegy.
@ -501,9 +480,6 @@ vv.
vál. vál.
vízv. vízv.
. .
w.
y.
z.
zrt. zrt.
zs. zs.
Ész. Ész.
@ -520,7 +496,6 @@ zs.
évf. évf.
í. í.
ó. ó.
ö.
össz. össz.
ötk. ötk.
özv. özv.
@ -528,7 +503,6 @@ zs.
úm. úm.
ún. ún.
út. út.
ü.
üag. üag.
üd. üd.
üdv. üdv.
@ -544,6 +518,5 @@ zs.
""".strip().split() """.strip().split()
OTHER_EXC = """ OTHER_EXC = """
''
-e -e
""".strip().split() """.strip().split()

View File

@ -1,8 +1,6 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from os import path
from ..language import Language from ..language import Language
from ..attrs import LANG from ..attrs import LANG

View File

@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@ -1,3 +1,4 @@
from .abbreviations import *
from .emoticons import * from .emoticons import *
from .punctuation import * from .punctuation import *
from .tag_map import * from .tag_map import *

View File

@ -0,0 +1,43 @@
# encoding: utf8
from __future__ import unicode_literals
# Language-independent tokenizer special cases: a handful of punctuation
# oddities followed by every single letter (a-z plus the umlauts ä, ö, ü)
# with a trailing period.
ABBREVIATIONS = (
    ["'", "\\\")", "<space>", "''", "C++"] +
    [_letter + "." for _letter in "abcdefghijklmnopqrstuvwxyzäöü"]
)

__all__ = [ "ABBREVIATIONS" ]

View File

@ -13,6 +13,7 @@ EMOTICONS = set("""
(-: (-:
=) =)
(= (=
")
:] :]
:-] :-]
[: [:

View File

@ -1,133 +1,115 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
TOKENIZER_PREFIXES = r'''
, _ALPHA_LOWER = """
" a ä à á â ǎ æ ã å ā ă ą b c ç ć č ĉ ċ c̄ d ð ď e é è ê ë ė ȅ ȩ ę f g ĝ ğ h i ı
( î ï í ī ì ȉ ǐ į ĩ j k ķ l ł ļ m n ñ ń ň ņ o ö ó ò ő ô õ œ ø ō ő ǒ ơ p q r ř ŗ s
ß ś š ş ŝ t ť u ú û ù ú ū ű ǔ ů ų ư v w ŵ x y ÿ ý ŷ z ź ž ż þ
"""
_ALPHA_UPPER = """
A Ä À Á Â Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Ė Ȅ Ȩ Ę F G Ĝ Ğ H I İ
Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S
Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ŷ Z Ź Ž Ż Þ
"""
_UNITS = """
km km² km³ m dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg
µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb
TB T G M K
"""
_CURRENCY = r"""
\$ £ ¥ ฿ US\$ C\$ A\$
"""
_QUOTES = r"""
' '' " ” “ `` ` ´ , „ » «
"""
_PUNCT = r"""
, : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &
"""
_HYPHENS = r"""
- -- ---
"""
# Patterns that match an ellipsis: two or more ASCII periods, or the single
# Unicode horizontal-ellipsis character (U+2026).
# NOTE(review): the second entry was an empty string, which cannot match any
# ellipsis; it looks like the "…" character was lost — confirm against the
# repository copy of this file.
LIST_ELLIPSES = [
    r'\.\.+',
    "…"
]
# Break each whitespace-separated definition string into a list of regex
# fragments. str.split() without arguments ignores leading, trailing and
# repeated whitespace, newlines included.
LIST_CURRENCY = _CURRENCY.split()
LIST_QUOTES = _QUOTES.split()
LIST_PUNCT = _PUNCT.split()
LIST_HYPHENS = _HYPHENS.split()

# Character-class contents: the letters with ALL whitespace removed.
# Fixed: the previous .strip().replace(' ', '') only removed spaces, so the
# line breaks inside the multi-line definition strings ended up inside the
# character classes.
ALPHA_LOWER = ''.join(_ALPHA_LOWER.split())
ALPHA_UPPER = ''.join(_ALPHA_UPPER.split())
ALPHA = ALPHA_LOWER + ALPHA_UPPER

# Alternations ("a|b|c") for use inside a regex group.
# Fixed: the previous .strip().replace(' ', '|') kept line breaks, fusing the
# last item of one line and the first item of the next into one alternative
# (e.g. "mg\nµg" in UNITS).
QUOTES = '|'.join(LIST_QUOTES)
CURRENCY = '|'.join(LIST_CURRENCY)
UNITS = '|'.join(_UNITS.split())
HYPHENS = '|'.join(LIST_HYPHENS)
# Prefixes
TOKENIZER_PREFIXES = (
['§', '%', r'\+'] +
LIST_PUNCT +
LIST_ELLIPSES +
LIST_QUOTES +
LIST_CURRENCY
)
# Suffixes
TOKENIZER_SUFFIXES = (
LIST_PUNCT +
LIST_ELLIPSES +
LIST_QUOTES +
[ [
{ r'(?<=[0-9])\+',
* r'(?<=°[FfCcKk])\.',
< r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
> r'(?<=[0-9])(?:{u})'.format(u=UNITS),
$ r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
£ r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
¡ "'s", "'S", "s", "S"
¿ ]
)
'
``
`
#
....
...
»
§
US$
C$
A$
a-
'''.strip().split('\n')
TOKENIZER_SUFFIXES = r''' # Infixes
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = (
TOKENIZER_INFIXES = r''' LIST_ELLIPSES +
[
\.\.\.+ r'(?<=[0-9])[+\-\*/^](?=[0-9])',
(?<=[a-z])\.(?=[A-Z]) r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
(?<=[a-z])\.(?=[A-Z]) r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
(?<=[a-zA-Z])-(?=[a-zA-z]) r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
(?<=[a-zA-Z])--(?=[a-zA-z]) r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
(?<=[0-9])-(?=[0-9]) ]
(?<=[A-Za-z]),(?=[A-Za-z]) )
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])
'''.strip().split('\n')
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] __all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

View File

@ -20,5 +20,6 @@ TAG_MAP = {
"X": {POS: X}, "X": {POS: X},
"CONJ": {POS: CONJ}, "CONJ": {POS: CONJ},
"ADJ": {POS: ADJ}, "ADJ": {POS: ADJ},
"VERB": {POS: VERB} "VERB": {POS: VERB},
"PART": {POS: PART}
} }

View File

@ -1,8 +1,6 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from os import path
from ..language import Language from ..language import Language
from ..attrs import LANG from ..attrs import LANG
from .language_data import * from .language_data import *

View File

@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@ -1,8 +1,6 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from os import path
from ..language import Language from ..language import Language
from ..attrs import LANG from ..attrs import LANG

View File

@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@ -1,8 +1,6 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from os import path
from ..language import Language from ..language import Language
from ..attrs import LANG from ..attrs import LANG
from .language_data import * from .language_data import *

View File

@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

View File

@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from ...de import German
@pytest.fixture
def de_tokenizer():
    """Provide a tokenizer created from the German language defaults."""
    tokenizer = German.Defaults.create_tokenizer()
    return tokenizer

View File

View File

@ -0,0 +1,27 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
def test_tokenizer_splits_contractions(de_tokenizer, text):
    """Each contraction is expected to yield exactly two tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 2
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
def test_tokenizer_handles_abbr(de_tokenizer, text):
    """Each abbreviation is expected to stay a single token."""
    doc = de_tokenizer(text)
    assert len(doc) == 1
def test_tokenizer_handles_exc_in_text(de_tokenizer):
    """An abbreviation inside a sentence stays one token and has its lemma."""
    doc = de_tokenizer("Ich bin z.Zt. im Urlaub.")
    assert len(doc) == 6
    abbreviation = doc[2]
    assert abbreviation.text == "z.Zt."
    assert abbreviation.lemma_ == "zur Zeit"

View File

@ -0,0 +1,116 @@
# coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(unter)"])
def test_tokenizer_splits_no_special(de_tokenizer, text):
    """A parenthesised word is expected to yield three tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 3
@pytest.mark.parametrize('text', ["unter'm"])
def test_tokenizer_splits_no_punct(de_tokenizer, text):
    """A bare contraction is expected to yield two tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 2
@pytest.mark.parametrize('text', ["(unter'm"])
def test_tokenizer_splits_prefix_punct(de_tokenizer, text):
    """A contraction with leading punctuation is expected to yield three tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 3
@pytest.mark.parametrize('text', ["unter'm)"])
def test_tokenizer_splits_suffix_punct(de_tokenizer, text):
    """A contraction with trailing punctuation is expected to yield three tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 3
@pytest.mark.parametrize('text', ["(unter'm)"])
def test_tokenizer_splits_even_wrap(de_tokenizer, text):
    """A contraction wrapped in parentheses is expected to yield four tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 4
@pytest.mark.parametrize('text', ["(unter'm?)"])
def test_tokenizer_splits_uneven_wrap(de_tokenizer, text):
    """One leading and two trailing punctuation marks give five tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 5
@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
def test_tokenizer_splits_prefix_interact(de_tokenizer, text, length):
    """Check the token count where prefixes meet abbreviation exceptions."""
    doc = de_tokenizer(text)
    assert len(doc) == length
@pytest.mark.parametrize('text', ["z.B.)"])
def test_tokenizer_splits_suffix_interact(de_tokenizer, text):
    """An abbreviation with a trailing parenthesis is expected to yield two tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 2
@pytest.mark.parametrize('text', ["(z.B.)"])
def test_tokenizer_splits_even_wrap_interact(de_tokenizer, text):
    """An abbreviation wrapped in parentheses is expected to yield three tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 3
@pytest.mark.parametrize('text', ["(z.B.?)"])
def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
    """An abbreviation with uneven surrounding punctuation gives four tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 4
@pytest.mark.parametrize('text', ["blau-rot"])
def test_tokenizer_splits_hyphens(de_tokenizer, text):
    """A hyphenated word is expected to yield three tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 3
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_tokenizer_splits_numeric_range(de_tokenizer, text):
    """A numeric range joined by a hyphen is expected to yield three tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 3
@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"])
def test_tokenizer_splits_period_infix(de_tokenizer, text):
    """A period between a lowercase and an uppercase letter gives three tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 3
@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"])
def test_tokenizer_splits_comma_infix(de_tokenizer, text):
    """A comma between two words is split off as its own token."""
    doc = de_tokenizer(text)
    assert len(doc) == 3
    parts = text.split(",")
    assert doc[0].text == parts[0]
    assert doc[1].text == ","
    assert doc[2].text == parts[1]
@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"])
def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
    """An ellipsis between two words is expected to yield three tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 3
def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
    """Double hyphens and single hyphens between words become separate tokens."""
    doc = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
    assert len(doc) == 12
    # Expected text of the first eleven tokens, in order.
    expected = [
        "Viele", "Regeln", "--", "wie", "die", "Bindestrich",
        "-", "Regeln", "--", "sind", "kompliziert",
    ]
    for index, word in enumerate(expected):
        assert doc[index].text == word

View File

@ -0,0 +1,45 @@
# coding: utf-8
"""Test that longer and mixed texts are tokenized correctly."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_long_text(de_tokenizer):
    """A multi-line passage is expected to produce 109 tokens."""
    text = """Die Verwandlung
Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in
seinem Bett zu einem ungeheueren Ungeziefer verwandelt.
Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig
hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten
Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit,
kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen
Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
»Was ist mit mir geschehen?«, dachte er."""
    doc = de_tokenizer(text)
    assert len(doc) == 109
@pytest.mark.parametrize('text,length', [
    ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
    ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
    ("Kraftfahrzeug-Haftpflichtversicherung", 3),
    ("Vakuum-Mittelfrequenz-Induktionsofen", 5)
])
def test_tokenizer_handles_long_words(de_tokenizer, text, length):
    """Long compounds stay whole; hyphenated ones split at each hyphen."""
    doc = de_tokenizer(text)
    assert len(doc) == length
@pytest.mark.parametrize('text,length', [
    ("»Was ist mit mir geschehen?«, dachte er.", 12),
    ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15)
])
def test_tokenizer_handles_examples(de_tokenizer, text, length):
    """Sentences with quotation marks produce the expected token counts."""
    doc = de_tokenizer(text)
    assert len(doc) == length

View File

View File

@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from ...en import English
@pytest.fixture
def en_tokenizer():
    """Provide a tokenizer created from the English language defaults."""
    tokenizer = English.Defaults.create_tokenizer()
    return tokenizer

View File

View File

@ -0,0 +1,87 @@
# coding: utf-8
"""Test that tokens are created correctly for contractions."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_basic_contraction(en_tokenizer):
    """"n't" is split off as its own token, with and without punctuation."""
    plain = en_tokenizer("don't giggle")
    assert len(plain) == 3
    assert plain[1].text == "n't"

    with_punct = en_tokenizer("i said don't!")
    assert len(with_punct) == 5
    assert with_punct[4].text == "!"
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    """A contraction with one adjacent punctuation mark gives three tokens."""
    doc = en_tokenizer(text)
    assert len(doc) == 3
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    """A possessive is split into the bare name and "'s"."""
    doc = en_tokenizer(text_poss)
    assert len(doc) == 2
    name_token, suffix_token = doc[0], doc[1]
    assert name_token.text == text
    assert suffix_token.text == "'s"
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
    """A trailing apostrophe is split off from the word before it."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    stem = text.split("'")[0]
    assert doc[0].text == stem
    assert doc[1].text == "'"
# Fixed: the function was named "text_tokenizer_..." and was therefore never
# collected by pytest, which only picks up functions whose names start with
# "test". NOTE(review): this test has never run before - confirm it passes.
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def test_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
    """Words listed as apostrophe exceptions stay a single, unchanged token."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
    """"'ll" is split off as its own token and lemmatised to "will"."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    stem = text.split("'")[0]
    assert doc[0].text == stem
    contraction = doc[1]
    assert contraction.text == "'ll"
    assert contraction.lemma_ == "will"
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
    """Lower-case and title-case contractions split the same way."""
    lower_doc = en_tokenizer(text_lower)
    title_doc = en_tokenizer(text_title)
    # The first tokens differ only in case; the second tokens are identical.
    assert title_doc[0].text == lower_doc[0].text.title()
    assert lower_doc[0].text == title_doc[0].text.lower()
    assert lower_doc[1].text == title_doc[1].text
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
    """A pronoun plus contraction splits into exactly those two surface forms."""
    doc = en_tokenizer(pron + contraction)
    first, second = doc[0], doc[1]
    assert first.text == pron
    assert second.text == contraction
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
    """Ordinary words that look like apostrophe-less contractions stay whole."""
    doc = en_tokenizer(exc)
    assert len(doc) == 1
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    """Adding punctuation to a contraction adds exactly one token."""
    bare_doc = en_tokenizer(wo_punct)
    assert len(bare_doc) == 2
    punct_doc = en_tokenizer(w_punct)
    assert len(punct_doc) == 3

View File

@ -0,0 +1,20 @@
# coding: utf-8
"""Test that tokenizer exceptions are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_tokenizer_handles_abbr(en_tokenizer, text):
    """A listed abbreviation is kept as one token."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 1
def test_tokenizer_handles_exc_in_text(en_tokenizer):
    """An abbreviation inside a sentence is kept as one token."""
    doc = en_tokenizer("It's mediocre i.e. bad.")
    assert len(doc) == 6
    abbreviation = doc[3]
    assert abbreviation.text == "i.e."

View File

@ -1,12 +1,14 @@
# coding: utf-8
"""Test that token.idx correctly computes index into the original string.""" """Test that token.idx correctly computes index into the original string."""
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
def test_simple_punct(en_tokenizer): def test_simple_punct(en_tokenizer):
text = 'to walk, do foo' text = "to walk, do foo"
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
assert tokens[0].idx == 0 assert tokens[0].idx == 0
assert tokens[1].idx == 3 assert tokens[1].idx == 3
@ -16,7 +18,7 @@ def test_simple_punct(en_tokenizer):
def test_complex_punct(en_tokenizer): def test_complex_punct(en_tokenizer):
text = 'Tom (D., Ill.)!' text = "Tom (D., Ill.)!"
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
assert tokens[0].idx == 0 assert tokens[0].idx == 0
assert len(tokens[0]) == 3 assert len(tokens[0]) == 3

View File

@ -0,0 +1,136 @@
# coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(can)"])
def test_tokenizer_splits_no_special(en_tokenizer, text):
    """A bracketed plain word gives three tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 3
@pytest.mark.parametrize('text', ["can't"])
def test_tokenizer_splits_no_punct(en_tokenizer, text):
    """A bare contraction gives two tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 2
@pytest.mark.parametrize('text', ["(can't"])
def test_tokenizer_splits_prefix_punct(en_tokenizer, text):
    """A contraction with a leading bracket gives three tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 3
@pytest.mark.parametrize('text', ["can't)"])
def test_tokenizer_splits_suffix_punct(en_tokenizer, text):
    """A contraction with a trailing bracket gives three tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 3
@pytest.mark.parametrize('text', ["(can't)"])
def test_tokenizer_splits_even_wrap(en_tokenizer, text):
    """A contraction wrapped in a bracket pair gives four tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 4
@pytest.mark.parametrize('text', ["(can't?)"])
def test_tokenizer_splits_uneven_wrap(en_tokenizer, text):
    """One opening and two closing marks around a contraction give five tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 5
@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
def test_tokenizer_splits_prefix_interact(en_tokenizer, text, length):
    """Each input produces the token count paired with it."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == length
@pytest.mark.parametrize('text', ["U.S.)"])
def test_tokenizer_splits_suffix_interact(en_tokenizer, text):
    """An abbreviation followed by a closing bracket gives two tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 2
@pytest.mark.parametrize('text', ["(U.S.)"])
def test_tokenizer_splits_even_wrap_interact(en_tokenizer, text):
    """An abbreviation wrapped in a bracket pair gives three tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 3
@pytest.mark.parametrize('text', ["(U.S.?)"])
def test_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text):
    """An abbreviation with one opening and two closing marks gives four tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 4
@pytest.mark.parametrize('text', ["best-known"])
def test_tokenizer_splits_hyphens(en_tokenizer, text):
    """A hyphenated compound gives three tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 3
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_tokenizer_splits_numeric_range(en_tokenizer, text):
    """A hyphen-joined numeric range gives three tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 3
@pytest.mark.parametrize('text', ["best.Known", "Hello.World"])
def test_tokenizer_splits_period_infix(en_tokenizer, text):
    """A period between a word and a capitalised word gives three tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 3
@pytest.mark.parametrize('text', ["Hello,world", "one,two"])
def test_tokenizer_splits_comma_infix(en_tokenizer, text):
    """A comma between two words is split out as its own token."""
    pieces = text.split(",")
    doc = en_tokenizer(text)
    assert len(doc) == 3
    assert doc[0].text == pieces[0]
    assert doc[1].text == ","
    assert doc[2].text == pieces[1]
@pytest.mark.parametrize('text', ["best...Known", "best...known"])
def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
    """An ellipsis between two words gives three tokens."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == 3
def test_tokenizer_splits_double_hyphen_infix(en_tokenizer):
    """Double hyphens and a single hyphen are split out as separate tokens."""
    doc = en_tokenizer("No decent--let alone well-bred--people.")
    # Only the first ten tokens are checked; anything after is not asserted.
    leading = ["No", "decent", "--", "let", "alone",
               "well", "-", "bred", "--", "people"]
    for position, word in enumerate(leading):
        assert doc[position].text == word
@pytest.mark.xfail
def test_tokenizer_splits_period_abbr(en_tokenizer):
    """A sentence-final period glued to a following abbreviation (marked xfail)."""
    doc = en_tokenizer("Today is Tuesday.Mr.")
    assert len(doc) == 5
    wanted = ["Today", "is", "Tuesday", ".", "Mr."]
    for position, word in enumerate(wanted):
        assert doc[position].text == word
@pytest.mark.xfail
def test_tokenizer_splits_em_dash_infix(en_tokenizer):
    """An em dash between sentences should be its own token (marked xfail)."""
    # Re Issue #225
    text = ("""Will this road take me to Puddleton?\u2014No, """
            """you'll have to walk there.\u2014Ariel.""")
    doc = en_tokenizer(text)
    for position, piece in ((6, "Puddleton"), (7, "?"), (8, "\u2014")):
        assert doc[position].text == piece

View File

@ -0,0 +1,132 @@
# coding: utf-8
"""Test that open, closed and paired punctuation is split off correctly."""
from __future__ import unicode_literals
import pytest
from ....util import compile_prefix_regex
from ....language_data import TOKENIZER_PREFIXES
# ``search`` attribute of whatever compile_prefix_regex builds from
# TOKENIZER_PREFIXES (presumably a compiled regex -- confirm in util).
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
# Opening marks and their closing partners, listed in matching order.
PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
# Each opener paired with the closer at the same position.
PUNCT_PAIRED = list(zip(PUNCT_OPEN, PUNCT_CLOSE))
@pytest.mark.parametrize('text', ["(", "((", "<"])
def test_tokenizer_handles_only_punct(en_tokenizer, text):
    """Punctuation-only input gives one token per character."""
    n_chars = len(text)
    assert len(en_tokenizer(text)) == n_chars
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
    """A single opening mark is split from the word that follows it."""
    doc = en_tokenizer(punct + text)
    assert len(doc) == 2
    opener, word = doc[0], doc[1]
    assert opener.text == punct
    assert word.text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
    """A single closing mark is split from the word that precedes it."""
    doc = en_tokenizer(text + punct)
    assert len(doc) == 2
    word, closer = doc[0], doc[1]
    assert word.text == text
    assert closer.text == punct
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
    """Two different leading marks are each split off before the word."""
    doc = en_tokenizer(punct + punct_add + text)
    assert len(doc) == 3
    for position, piece in enumerate((punct, punct_add, text)):
        assert doc[position].text == piece
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
    """Two different trailing marks are each split off after the word."""
    doc = en_tokenizer(text + punct + punct_add)
    assert len(doc) == 3
    for position, piece in enumerate((text, punct, punct_add)):
        assert doc[position].text == piece
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
    """Three repeated opening marks are split one by one before the word."""
    doc = en_tokenizer(punct * 3 + text)
    assert len(doc) == 4
    assert doc[0].text == punct
    assert doc[3].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
    """Three repeated closing marks are split one by one after the word."""
    doc = en_tokenizer(text + punct * 3)
    assert len(doc) == 4
    assert doc[0].text == text
    assert doc[1].text == punct
@pytest.mark.parametrize('text', ["'The"])
def test_tokenizer_splits_open_appostrophe(en_tokenizer, text):
    """A leading apostrophe is split off as the first of two tokens."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    first = doc[0]
    assert first.text == "'"
@pytest.mark.parametrize('text', ["Hello''"])
def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
    """A doubled apostrophe counts as one token, after a word or alone."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    quote_only = en_tokenizer("''")
    assert len(quote_only) == 1
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text):
    """A word wrapped in a matching pair splits into opener, word, closer."""
    doc = en_tokenizer(punct_open + text + punct_close)
    assert len(doc) == 3
    for position, piece in enumerate((punct_open, text, punct_close)):
        assert doc[position].text == piece
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")])
@pytest.mark.parametrize('text', ["Hello"])
def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text):
    """A word wrapped in two nested pairs of marks splits into five tokens."""
    pieces = (punct_open_add, punct_open, text, punct_close, punct_close_add)
    doc = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add)
    assert len(doc) == 5
    for position, piece in enumerate(pieces):
        assert doc[position].text == piece
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
def test_tokenizer_splits_pre_punct_regex(text, punct):
    """The prefix search matches the leading punctuation mark."""
    found = en_search_prefixes(text).group()
    assert found == punct
def test_tokenizer_splits_bracket_period(en_tokenizer):
    """The period after a closing bracket ends up as the last token."""
    doc = en_tokenizer("(And a 6a.m. run through Washington Park).")
    last_index = len(doc) - 1
    assert doc[last_index].text == "."

View File

@ -0,0 +1,36 @@
# coding: utf-8
"""Test that longer and mixed texts are tokenized correctly."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_long_text(en_tokenizer):
    """Check the token count for a multi-line news excerpt."""
    # NOTE(review): the count asserted below presumably depends on the exact
    # line breaks inside this literal -- confirm before reformatting it.
    text = """Tributes pour in for late British Labour Party leader
Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.
In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
"Mr. Smith, throughout his distinguished"""
    tokens = en_tokenizer(text)
    assert len(tokens) == 76
@pytest.mark.parametrize('text,length', [
    ("The U.S. Army likes Shock and Awe.", 8),
    ("U.N. regulations are not a part of their concern.", 10),
    ("“Isn't it?”", 6),
    ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
    ("""'Me too!', Mr. P. Delaware cried. """, 11),
    ("They ran about 10km.", 6),
    # ("But then the 6,000-year ice age came...", 10)
    ])
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
    """Each sample sentence produces the token count paired with it."""
    n_tokens = len(en_tokenizer(text))
    assert n_tokens == length

View File

@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from ...hu import Hungarian
@pytest.fixture
def hu_tokenizer():
    """Provide a tokenizer created from the Hungarian language defaults."""
    defaults = Hungarian.Defaults
    return defaults.create_tokenizer()

View File

@ -2,9 +2,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
from spacy.hu import Hungarian
_DEFAULT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
DEFAULT_TESTS = [
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
@ -18,9 +19,10 @@ _DEFAULT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági',
('Valami ...van...', ['Valami', '...', 'van', '...']), ('Valami ...van...', ['Valami', '...', 'van', '...']),
('Valami...', ['Valami', '...']), ('Valami...', ['Valami', '...']),
('Valami ...', ['Valami', '...']), ('Valami ...', ['Valami', '...']),
('Valami ... más.', ['Valami', '...', 'más', '.'])] ('Valami ... más.', ['Valami', '...', 'más', '.'])
]
_HYPHEN_TESTS = [ HYPHEN_TESTS = [
('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']), ('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']),
('Egy -nak.', ['Egy', '-nak', '.']), ('Egy -nak.', ['Egy', '-nak', '.']),
('Egy bel-.', ['Egy', 'bel-', '.']), ('Egy bel-.', ['Egy', 'bel-', '.']),
@ -39,9 +41,11 @@ _HYPHEN_TESTS = [
('A 7-es.', ['A', '7-es', '.']), ('A 7-es.', ['A', '7-es', '.']),
('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']), ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
('A %-sal.', ['A', '%-sal', '.']), ('A %-sal.', ['A', '%-sal', '.']),
('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])] ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])
]
_NUMBER_TESTS = [('A 2b van.', ['A', '2b', 'van', '.']), NUMBER_TESTS = [
('A 2b van.', ['A', '2b', 'van', '.']),
('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']),
('A 2b.', ['A', '2b', '.']), ('A 2b.', ['A', '2b', '.']),
('A 2b-ben.', ['A', '2b-ben', '.']), ('A 2b-ben.', ['A', '2b-ben', '.']),
@ -190,15 +194,19 @@ _NUMBER_TESTS = [('A 2b van.', ['A', '2b', 'van', '.']),
('A III/c-ben.', ['A', 'III/c-ben', '.']), ('A III/c-ben.', ['A', 'III/c-ben', '.']),
('A TU154 van.', ['A', 'TU154', 'van', '.']), ('A TU154 van.', ['A', 'TU154', 'van', '.']),
('A TU154-ben van.', ['A', 'TU154-ben', 'van', '.']), ('A TU154-ben van.', ['A', 'TU154-ben', 'van', '.']),
('A TU154-ben.', ['A', 'TU154-ben', '.'])] ('A TU154-ben.', ['A', 'TU154-ben', '.'])
]
_QUOTE_TESTS = [('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), QUOTE_TESTS = [
('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']),
('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']), ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']),
("A don't van.", ['A', "don't", 'van', '.'])] ("A don't van.", ['A', "don't", 'van', '.'])
]
_DOT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), DOT_TESTS = [
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
@ -212,22 +220,15 @@ _DOT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'sz
('Valami ...van...', ['Valami', '...', 'van', '...']), ('Valami ...van...', ['Valami', '...', 'van', '...']),
('Valami...', ['Valami', '...']), ('Valami...', ['Valami', '...']),
('Valami ...', ['Valami', '...']), ('Valami ...', ['Valami', '...']),
('Valami ... más.', ['Valami', '...', 'más', '.'])] ('Valami ... más.', ['Valami', '...', 'más', '.'])
]
@pytest.fixture(scope="session") TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS # + NUMBER_TESTS + HYPHEN_TESTS
def HU():
return Hungarian()
@pytest.fixture(scope="module") @pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def hu_tokenizer(HU): def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
return HU.tokenizer tokens = hu_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
@pytest.mark.parametrize(("input", "expected_tokens"),
_DEFAULT_TESTS + _HYPHEN_TESTS + _NUMBER_TESTS + _DOT_TESTS + _QUOTE_TESTS)
def test_testcases(hu_tokenizer, input, expected_tokens):
tokens = hu_tokenizer(input)
token_list = [token.orth_ for token in tokens if not token.is_space]
assert expected_tokens == token_list assert expected_tokens == token_list

View File

@ -0,0 +1,16 @@
from __future__ import unicode_literals
from ...en import English
import pytest
@pytest.fixture
def en_tokenizer():
    """Provide a tokenizer created from the English language defaults."""
    defaults = English.Defaults
    return defaults.create_tokenizer()
def test_issue351(en_tokenizer):
    """Check offsets of the first two tokens when the text starts with whitespace."""
    doc = en_tokenizer(" This is a cat.")
    first, second = doc[0], doc[1]
    assert first.idx == 0
    assert len(first) == 3
    assert second.idx == 3

View File

@ -0,0 +1,14 @@
from __future__ import unicode_literals
from ...en import English
import pytest
@pytest.fixture
def en_tokenizer():
    """Provide a tokenizer created from the English language defaults."""
    defaults = English.Defaults
    return defaults.create_tokenizer()
def test_big_ellipsis(en_tokenizer):
    """A long run of periods between two words gives more than two tokens."""
    n_tokens = len(en_tokenizer(u'$45...............Asking'))
    assert n_tokens > 2

View File

@ -1,4 +0,0 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of 26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of 1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]

View File

@ -1,7 +1,23 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest import pytest
from spacy.en import English
from ...en import English
from ...de import German
from ...es import Spanish
from ...it import Italian
from ...fr import French
from ...pt import Portuguese
from ...nl import Dutch
from ...sv import Swedish
from ...hu import Hungarian
@pytest.fixture(scope="module") LANGUAGES = [English, German, Spanish, Italian, French, Dutch, Swedish, Hungarian]
def en_tokenizer(EN):
return EN.tokenizer
@pytest.fixture(params=LANGUAGES)
def tokenizer(request):
    """Provide a default tokenizer for each language class in LANGUAGES."""
    return request.param.Defaults.create_tokenizer()

View File

@ -1,58 +0,0 @@
from __future__ import unicode_literals
import pytest
def test_possess(en_tokenizer):
    """A possessive splits into the name and a possessive marker."""
    doc = en_tokenizer("Mike's")
    strings = en_tokenizer.vocab.strings
    assert strings[doc[0].orth] == "Mike"
    assert strings[doc[1].orth] == "'s"
    assert len(doc) == 2
def test_apostrophe(en_tokenizer):
    """A trailing apostrophe is split from the plural noun."""
    doc = en_tokenizer("schools'")
    assert len(doc) == 2
    noun, mark = doc[0], doc[1]
    assert mark.orth_ == "'"
    assert noun.orth_ == "schools"
def test_LL(en_tokenizer):
    """The 'll contraction is split off and lemmatised as "will"."""
    doc = en_tokenizer("we'll")
    assert len(doc) == 2
    pronoun, contraction = doc[0], doc[1]
    assert contraction.orth_ == "'ll"
    assert contraction.lemma_ == "will"
    assert pronoun.orth_ == "we"
def test_aint(en_tokenizer):
    """The word "ain't" splits in two, with lemmas "be" and "not"."""
    doc = en_tokenizer("ain't")
    assert len(doc) == 2
    verb, negation = doc[0], doc[1]
    assert verb.orth_ == "ai"
    assert verb.lemma_ == "be"
    assert negation.orth_ == "n't"
    assert negation.lemma_ == "not"
def test_capitalized(en_tokenizer):
    """Capitalised contractions split in two like their lower-case forms."""
    for sample in ("can't", "Can't"):
        assert len(en_tokenizer(sample)) == 2
    doc = en_tokenizer("Ain't")
    assert len(doc) == 2
    verb = doc[0]
    assert verb.orth_ == "Ai"
    assert verb.lemma_ == "be"
def test_punct(en_tokenizer):
    """Leading backticks add one token to a contraction."""
    bare_doc = en_tokenizer("We've")
    assert len(bare_doc) == 2
    quoted_doc = en_tokenizer("``We've")
    assert len(quoted_doc) == 3
@pytest.mark.xfail
def test_therell(en_tokenizer):
    """The word "there'll" should split into "there" and the 'll contraction.

    The xfail marker is kept as found. The second token is now compared with
    the contraction; it was previously compared with "there" a second time,
    which could not hold together with the first-token check.
    """
    tokens = en_tokenizer("there'll")
    assert len(tokens) == 2
    assert tokens[0].text == "there"
    assert tokens[1].text == "'ll"

View File

@ -1,35 +0,0 @@
from __future__ import unicode_literals
import pytest
def test_tweebo_challenge(en_tokenizer):
    """Each whitespace-separated emoticon in the sample stays one token."""
    text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
    doc = en_tokenizer(text)
    emoticons = [
        ":o", ":/", ":'(", ">:o", "(:", ":)", ">.<", "XD", "-__-", "o.O",
        ";D", ":-)", "@_@", ":P", "8D", ":1", ">:(", ":D", "=|", '")',
        ':>', '....',
    ]
    for position, emoticon in enumerate(emoticons):
        assert doc[position].orth_ == emoticon
def test_false_positive(en_tokenizer):
    """A word followed by colon and bracket gives three tokens."""
    n_tokens = len(en_tokenizer("example:)"))
    assert n_tokens == 3

View File

@ -0,0 +1,41 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handled correctly."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_emoticons(tokenizer):
    """Each whitespace-separated emoticon in the sample stays one token."""
    # Tweebo challenge (CMU)
    text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
    doc = tokenizer(text)
    emoticons = [
        ":o", ":/", ":'(", ">:o", "(:", ":)", ">.<", "XD", "-__-", "o.O",
        ";D", ":-)", "@_@", ":P", "8D", ":1", ">:(", ":D", "=|", '")',
        ':>', '....',
    ]
    for position, emoticon in enumerate(emoticons):
        assert doc[position].text == emoticon
@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
    """Each emoticon look-alike produces the token count paired with it."""
    n_tokens = len(tokenizer(text))
    assert n_tokens == length

View File

@ -1,62 +0,0 @@
from __future__ import unicode_literals
import pytest
def test_hyphen(en_tokenizer):
    """A hyphenated compound gives three tokens."""
    n_tokens = len(en_tokenizer('best-known'))
    assert n_tokens == 3
def test_numeric_range(en_tokenizer):
    """A hyphen-joined numeric range gives three tokens."""
    n_tokens = len(en_tokenizer('0.1-13.5'))
    assert n_tokens == 3
def test_period(en_tokenizer):
    """A period before a capital splits; a domain-like string stays whole."""
    split_doc = en_tokenizer('best.Known')
    assert len(split_doc) == 3
    domain_doc = en_tokenizer('zombo.com')
    assert len(domain_doc) == 1
def test_ellipsis(en_tokenizer):
    """An ellipsis between two words gives three tokens in either casing."""
    for sample in ('best...Known', 'best...known'):
        assert len(en_tokenizer(sample)) == 3
def test_big_ellipsis(en_tokenizer):
    """Test regression identified in Issue #360."""
    n_tokens = len(en_tokenizer(u'$45...............Asking'))
    assert n_tokens > 2
def test_email(en_tokenizer):
    """An e-mail address is kept as one token."""
    for address in ('hello@example.com', 'hi+there@gmail.it'):
        assert len(en_tokenizer(address)) == 1
def test_double_hyphen(en_tokenizer):
    """Double hyphens and a single hyphen are split out as separate tokens."""
    doc = en_tokenizer(u'No decent--let alone well-bred--people.')
    # TODO: This points to a deeper issue with the tokenizer: it doesn't
    # re-enter on infixes (note carried over from the check on token 7).
    leading = [u'No', u'decent', u'--', u'let', u'alone',
               u'well', u'-', u'bred', u'--', u'people']
    for position, word in enumerate(leading):
        assert doc[position].text == word
def test_infix_comma(en_tokenizer):
    """A comma between two words is split out as its own token."""
    # Re issue #326
    doc = en_tokenizer(u'Hello,world')
    for position, piece in enumerate((u'Hello', u',', u'world')):
        assert doc[position].text == piece

View File

@ -1,9 +0,0 @@
from __future__ import unicode_literals
def test_only_pre1(en_tokenizer):
    """A lone opening bracket is one token."""
    doc = en_tokenizer("(")
    assert len(doc) == 1
def test_only_pre2(en_tokenizer):
    """Two opening brackets are two tokens."""
    doc = en_tokenizer("((")
    assert len(doc) == 2

View File

@ -1,43 +0,0 @@
from __future__ import unicode_literals
import pytest
@pytest.fixture
def close_puncts():
    """Provide the closing punctuation marks used by the tests in this file."""
    marks = [')', ']', '}', '*']
    return marks
def test_close(close_puncts, en_tokenizer):
    """Each closing mark is split from the word that precedes it."""
    word = 'Hello'
    for closer in close_puncts:
        doc = en_tokenizer(word + closer)
        assert len(doc) == 2
        assert doc[1].string == closer
        assert doc[0].string == word
def test_two_different_close(close_puncts, en_tokenizer):
    """A closing mark followed by an apostrophe gives two trailing tokens."""
    word = 'Hello'
    for closer in close_puncts:
        doc = en_tokenizer(word + closer + "'")
        assert len(doc) == 3
        for position, piece in enumerate((word, closer, "'")):
            assert doc[position].string == piece
def test_three_same_close(close_puncts, en_tokenizer):
    """Three repeated closing marks are split one by one after the word."""
    word = 'Hello'
    for closer in close_puncts:
        doc = en_tokenizer(word + closer * 3)
        assert len(doc) == 4
        assert doc[0].string == word
        assert doc[1].string == closer
def test_double_end_quote(en_tokenizer):
    """A doubled apostrophe counts as one token, after a word or alone."""
    after_word = en_tokenizer("Hello''")
    assert len(after_word) == 2
    alone = en_tokenizer("''")
    assert len(alone) == 1

View File

@ -1,46 +0,0 @@
from __future__ import unicode_literals
import pytest
@pytest.fixture
def open_puncts():
    """Provide the opening punctuation marks used by the tests in this file."""
    marks = ['(', '[', '{', '*']
    return marks
def test_open(open_puncts, en_tokenizer):
    """Each opening mark is split from the word that follows it."""
    word = 'Hello'
    for opener in open_puncts:
        doc = en_tokenizer(opener + word)
        assert len(doc) == 2
        assert doc[0].orth_ == opener
        assert doc[1].orth_ == word
def test_two_different_open(open_puncts, en_tokenizer):
    """An opening mark followed by a backtick gives two leading tokens."""
    word = 'Hello'
    for opener in open_puncts:
        doc = en_tokenizer(opener + "`" + word)
        assert len(doc) == 3
        for position, piece in enumerate((opener, "`", word)):
            assert doc[position].orth_ == piece
def test_three_same_open(open_puncts, en_tokenizer):
    """Three repeated opening marks are split one by one before the word."""
    word = 'Hello'
    for opener in open_puncts:
        doc = en_tokenizer(opener * 3 + word)
        assert len(doc) == 4
        assert doc[0].orth_ == opener
        assert doc[3].orth_ == word
def test_open_appostrophe(en_tokenizer):
    """A leading apostrophe is split off as the first of two tokens."""
    doc = en_tokenizer("'The")
    assert len(doc) == 2
    first = doc[0]
    assert first.orth_ == "'"

View File

@ -1,46 +0,0 @@
"""Test entries in the tokenization special-case interacting with prefix
and suffix punctuation."""
from __future__ import unicode_literals
import pytest
def test_no_special(en_tokenizer):
    """A bracketed plain word gives three tokens."""
    doc = en_tokenizer("(can)")
    assert len(doc) == 3
def test_no_punct(en_tokenizer):
    """A bare contraction gives two tokens."""
    doc = en_tokenizer("can't")
    assert len(doc) == 2
def test_prefix(en_tokenizer):
    """A contraction with a leading bracket gives three tokens."""
    doc = en_tokenizer("(can't")
    assert len(doc) == 3
def test_suffix(en_tokenizer):
    """A contraction with a trailing bracket gives three tokens."""
    doc = en_tokenizer("can't)")
    assert len(doc) == 3
def test_wrap(en_tokenizer):
    """A contraction wrapped in a bracket pair gives four tokens."""
    doc = en_tokenizer("(can't)")
    assert len(doc) == 4
def test_uneven_wrap(en_tokenizer):
    """One opening and two closing marks around a contraction give five tokens."""
    doc = en_tokenizer("(can't?)")
    assert len(doc) == 5
def test_prefix_interact(en_tokenizer):
    """Check token counts for an abbreviation, a plain word and a bracketed abbreviation."""
    cases = (("U.S.", 1), ("us.", 2), ("(U.S.", 2))
    for sample, expected in cases:
        assert len(en_tokenizer(sample)) == expected
def test_suffix_interact(en_tokenizer):
    """An abbreviation followed by a closing bracket gives two tokens."""
    doc = en_tokenizer("U.S.)")
    assert len(doc) == 2
def test_even_wrap_interact(en_tokenizer):
    """An abbreviation wrapped in a bracket pair gives three tokens."""
    doc = en_tokenizer("(U.S.)")
    assert len(doc) == 3
def test_uneven_wrap_interact(en_tokenizer):
    """An abbreviation with one opening and two closing marks gives four tokens."""
    doc = en_tokenizer("(U.S.?)")
    assert len(doc) == 4

View File

@ -1,9 +0,0 @@
"""Test suspected freeing of strings"""
from __future__ import unicode_literals
def test_one(en_tokenizer):
    """The first token text is still readable after a second tokenizer call."""
    first_doc = en_tokenizer('Betty Botter bought a pound of butter.')
    assert first_doc[0].orth_ == 'Betty'
    second_doc = en_tokenizer('Betty also bought a pound of butter.')
    assert second_doc[0].orth_ == 'Betty'

View File

@ -1,32 +0,0 @@
from __future__ import unicode_literals
import pytest
@pytest.fixture
def paired_puncts():
    """Provide the (opening, closing) punctuation pairs used by the tests in this file."""
    pairs = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
    return pairs
def test_token(paired_puncts, en_tokenizer):
    """A word wrapped in a matching pair splits into opener, word, closer."""
    word = 'Hello'
    for opener, closer in paired_puncts:
        doc = en_tokenizer(opener + word + closer)
        assert len(doc) == 3
        for position, piece in enumerate((opener, word, closer)):
            assert doc[position].orth_ == piece
def test_two_different(paired_puncts, en_tokenizer):
    """A word wrapped in a backtick/apostrophe pair plus a bracket pair gives five tokens.

    For every (opening, closing) pair from the fixture the expected order is:
    backtick, opening mark, word, closing mark, apostrophe.
    """
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = "`" + open_ + word_str + close_ + "'"
        tokens = en_tokenizer(string)
        assert len(tokens) == 5
        assert tokens[0].orth_ == "`"
        assert tokens[1].orth_ == open_
        # The check on token 2 was previously written twice; once is enough.
        assert tokens[2].orth_ == word_str
        assert tokens[3].orth_ == close_
        assert tokens[4].orth_ == "'"

View File

@ -1,172 +1,83 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from os import path
import pytest import pytest
import io
import pickle
import cloudpickle
import tempfile
from ... import util from ...util import utf8open
from ...language_data import TOKENIZER_PREFIXES
en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
# @pytest.mark.xfail def test_tokenizer_handles_no_word(tokenizer):
# def test_pickle(en_tokenizer): tokens = tokenizer("")
# file_ = io.BytesIO()
# cloudpickle.dump(en_tokenizer, file_)
# file_.seek(0)
# loaded = pickle.load(file_)
# assert loaded is not None
def test_pre_punct_regex():
string = "(can't"
match = en_search_prefixes(string)
assert match.group() == "("
def test_no_word(en_tokenizer):
tokens = en_tokenizer(u'')
assert len(tokens) == 0 assert len(tokens) == 0
def test_single_word(en_tokenizer): @pytest.mark.parametrize('text', ["lorem"])
tokens = en_tokenizer(u'hello') def test_tokenizer_handles_single_word(tokenizer, text):
assert tokens[0].orth_ == 'hello' tokens = tokenizer(text)
assert tokens[0].text == text
def test_two_words(en_tokenizer): def test_tokenizer_handles_punct(tokenizer):
tokens = en_tokenizer('hello possums') text = "Lorem, ipsum."
assert len(tokens) == 2 tokens = tokenizer(text)
assert tokens[0].orth_ != tokens[1].orth_
def test_punct(en_tokenizer):
tokens = en_tokenizer('hello, possums.')
assert len(tokens) == 4 assert len(tokens) == 4
assert tokens[0].orth_ == 'hello' assert tokens[0].text == "Lorem"
assert tokens[1].orth_ == ',' assert tokens[1].text == ","
assert tokens[2].orth_ == 'possums' assert tokens[2].text == "ipsum"
assert tokens[1].orth_ != 'hello' assert tokens[1].text != "Lorem"
def test_digits(en_tokenizer): def test_tokenizer_handles_digits(tokenizer):
tokens = en_tokenizer('The year: 1984.') exceptions = ["hu"]
text = "Lorem ipsum: 1984."
tokens = tokenizer(text)
if tokens[0].lang_ not in exceptions:
assert len(tokens) == 5 assert len(tokens) == 5
assert tokens[0].orth == en_tokenizer.vocab['The'].orth assert tokens[0].text == "Lorem"
assert tokens[3].orth == en_tokenizer.vocab['1984'].orth assert tokens[3].text == "1984"
def test_contraction(en_tokenizer): @pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
tokens = en_tokenizer("don't giggle") def test_tokenizer_keep_urls(tokenizer, text):
assert len(tokens) == 3 tokens = tokenizer(text)
assert tokens[1].orth == en_tokenizer.vocab["n't"].orth assert len(tokens) == 1
tokens = en_tokenizer("i said don't!")
assert len(tokens) == 5
assert tokens[4].orth == en_tokenizer.vocab['!'].orth
def test_contraction_punct(en_tokenizer):
tokens = [w.text for w in en_tokenizer("(can't")]
assert tokens == ['(', 'ca', "n't"]
tokens = en_tokenizer("`ain't")
assert len(tokens) == 3
tokens = en_tokenizer('''"isn't''')
assert len(tokens) == 3
tokens = en_tokenizer("can't!")
assert len(tokens) == 3
def test_sample(en_tokenizer): @pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
text = """Tributes pour in for late British Labour Party leader def test_tokenizer_keeps_email(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 1
Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.
In Washington, the US State Department issued a statement regretting "the def test_tokenizer_handles_long_text(tokenizer):
untimely death" of the rapier-tongued Scottish barrister and parliamentarian. text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit
"Mr. Smith, throughout his distinguished""" Cras egestas orci non porttitor maximus.
Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.
tokens = en_tokenizer(text) Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.
"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo."""
tokens = tokenizer(text)
assert len(tokens) > 5 assert len(tokens) > 5
def test_cnts1(en_tokenizer): @pytest.mark.parametrize('file_name', ["sun.txt"])
text = u"""The U.S. Army likes Shock and Awe.""" def test_tokenizer_handle_text_from_file(tokenizer, file_name):
tokens = en_tokenizer(text) loc = path.join(path.dirname(__file__), '..', file_name)
assert len(tokens) == 8 text = utf8open(loc).read()
assert len(text) != 0
tokens = tokenizer(text)
assert len(tokens) > 100
def test_cnts2(en_tokenizer): def test_tokenizer_suspected_freeing_strings(tokenizer):
text = u"""U.N. regulations are not a part of their concern.""" text1 = "Lorem dolor sit amet, consectetur adipiscing elit."
tokens = en_tokenizer(text) text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
assert len(tokens) == 10 tokens1 = tokenizer(text1)
tokens2 = tokenizer(text2)
assert tokens1[0].text == "Lorem"
def test_cnts3(en_tokenizer): assert tokens2[0].text == "Lorem"
text = u"“Isn't it?”"
tokens = en_tokenizer(text)
words = [t.orth_ for t in tokens]
assert len(words) == 6
def test_cnts4(en_tokenizer):
    """Expect 15 tokens for a sentence mixing a quoted clause and a title."""
    sentence = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
    orths = []
    for tok in en_tokenizer(sentence):
        orths.append(tok.orth_)
    assert len(orths) == 15
def test_cnts5(en_tokenizer):
    """Expect 11 tokens for a quoted exclamation followed by abbreviations."""
    doc = en_tokenizer("""'Me too!', Mr. P. Delaware cried. """)
    n_tokens = len(doc)
    assert n_tokens == 11
@pytest.mark.xfail
def test_mr(en_tokenizer):
    """Expected failure: 'Mr.' glued to a sentence-final period.

    The desired split is five tokens, with 'Mr.' kept whole.
    """
    doc = en_tokenizer("""Today is Tuesday.Mr.""")
    expected = ['Today', 'is', 'Tuesday', '.', 'Mr.']
    assert len(doc) == 5
    assert [tok.orth_ for tok in doc] == expected
def test_cnts6(en_tokenizer):
    """Expect 6 tokens for a sentence ending in a number fused with a unit."""
    doc = en_tokenizer(u'They ran about 10km.')
    orths = [tok.orth_ for tok in doc]
    n_words = len(orths)
    assert n_words == 6
def test_bracket_period(en_tokenizer):
    """The period after a closing bracket must be the final token."""
    doc = en_tokenizer(u'(And a 6a.m. run through Washington Park).')
    last_index = len(doc) - 1
    final_token = doc[last_index]
    assert final_token.orth_ == u'.'
def test_ie(en_tokenizer):
    """'i.e.' is kept as a single token (index 3 of 6)."""
    doc = en_tokenizer(u"It's mediocre i.e. bad.")
    assert len(doc) == 6
    abbreviation = doc[3]
    assert abbreviation.orth_ == "i.e."
def test_two_whitespace(en_tokenizer):
    """Round-trip a text that ends in two spaces through ``text_with_ws``.

    The literal must really end in TWO spaces, as its own wording says;
    with a single trailing space the test would not exercise the case it
    is named after.
    """
    orig_str = u'there are 2 spaces after this  '
    tokens = en_tokenizer(orig_str)
    # repr() makes trailing-whitespace differences visible in failure output.
    assert repr(tokens.text_with_ws) == repr(orig_str)
@pytest.mark.xfail
def test_em_dash_infix(en_tokenizer):
    """Expected failure (Re Issue #225): an em dash between sentences.

    The dash should come out as its own token after the question mark.
    """
    text = ('''Will this road take me to Puddleton?\u2014No, '''
            '''you'll have to walk there.\u2014Ariel.''')
    doc = en_tokenizer(text)
    expected_by_index = ((6, 'Puddleton'), (7, '?'), (8, '\u2014'))
    for index, expected in expected_by_index:
        assert doc[index].text == expected
#def test_cnts7():
# text = 'But then the 6,000-year ice age came...'
# tokens = EN.tokenize(text)
# assert len(tokens) == 10

View File

@ -1,67 +1,51 @@
# coding: utf-8
"""Test that tokens are created correctly for whitespace.""" """Test that tokens are created correctly for whitespace."""
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
def test_single_space(en_tokenizer): @pytest.mark.parametrize('text', ["lorem ipsum"])
tokens = en_tokenizer('hello possums') def test_tokenizer_splits_single_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 2 assert len(tokens) == 2
def test_double_space(en_tokenizer): @pytest.mark.parametrize('text', ["lorem ipsum"])
tokens = en_tokenizer('hello possums') def test_tokenizer_splits_double_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
assert tokens[1].orth_ == ' ' assert tokens[1].text == " "
def test_newline(en_tokenizer): @pytest.mark.parametrize('text', ["lorem ipsum "])
tokens = en_tokenizer('hello\npossums') def test_tokenizer_handles_double_trainling_ws(tokenizer, text):
tokens = tokenizer(text)
assert repr(tokens.text_with_ws) == repr(text)
@pytest.mark.parametrize('text', ["lorem\nipsum"])
def test_tokenizer_splits_newline(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == "\n"
@pytest.mark.parametrize('text', ["lorem \nipsum"])
def test_tokenizer_splits_newline_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
def test_newline_space(en_tokenizer): @pytest.mark.parametrize('text', ["lorem \nipsum"])
tokens = en_tokenizer('hello \npossums') def test_tokenizer_splits_newline_double_space(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
def test_newline_double_space(en_tokenizer): @pytest.mark.parametrize('text', ["lorem \n ipsum"])
tokens = en_tokenizer('hello \npossums') def test_tokenizer_splits_newline_space_wrap(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
def test_newline_space_wrap(en_tokenizer):
    """A newline wrapped in single spaces yields three tokens in total."""
    doc = en_tokenizer('hello \n possums')
    n_tokens = len(doc)
    assert n_tokens == 3
def test_leading_space_offsets(en_tokenizer):
    '''Issue #351: character offsets when the text starts with whitespace.

    Reproduction from the issue:

    # this works
    text1 = u"This is a cat."
    a = english_spacy(text1)
    tok0 = list(a.sents)[0][0]
    print tok0, tok0.idx, text1[tok0.idx]
    tok1 = list(a.sents)[0][1]
    print tok1, tok1.idx, text1[tok1.idx]
    print "=="
    # this does not work
    text2 = u" This is a cat."
    b = english_spacy(text2)
    tok0 = list(b.sents)[0][0]
    print tok0, tok0.idx, text2[tok0.idx]
    tok1 = list(b.sents)[0][1]
    print tok1, tok1.idx, text2[tok1.idx]
    '''
    # Three leading spaces are required: the assertions below expect the
    # first token to be 3 characters long and the second to start at 3.
    doc = en_tokenizer(u"   This is a cat.")
    assert doc[0].idx == 0
    assert len(doc[0]) == 3
    assert doc[1].idx == 3

View File

@ -1,21 +0,0 @@
from __future__ import unicode_literals
from spacy.util import utf8open
import pytest
from os import path
HERE = path.dirname(__file__)
@pytest.fixture
def sun_txt():
    """Return the contents of ``sun.txt`` from the parent directory.

    The file is opened through ``utf8open`` and closed as soon as it has
    been read, instead of being left for the garbage collector.
    """
    loc = path.join(HERE, '..', 'sun.txt')
    # assumes utf8open returns a regular file object usable as a context
    # manager -- TODO confirm against spacy.util
    with utf8open(loc) as file_:
        return file_.read()
def test_tokenize(sun_txt, en_tokenizer):
    """A non-empty sample file must tokenize into more than 100 tokens."""
    n_chars = len(sun_txt)
    assert n_chars != 0
    doc = en_tokenizer(sun_txt)
    assert len(doc) > 100

View File

@ -1,20 +0,0 @@
from __future__ import unicode_literals
import pytest
import os
@pytest.fixture(scope='session')
def nlp():
    """Session-scoped English pipeline.

    The ``path`` argument is the value of the ``SPACY_DATA`` environment
    variable when that is set to a non-empty string, otherwise ``True``.
    """
    from spacy.en import English
    data_dir = os.environ.get('SPACY_DATA') or True
    return English(path=data_dir)
@pytest.fixture()
def doc(nlp):
    """Processed two-sentence sample document."""
    words = ['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']
    # Look every word up in the vocab before running the pipeline
    # (presumably so the entries exist beforehand -- verify).
    for word in words:
        nlp.vocab[word]
    return nlp('Hello, world. Here are two sentences.')

View File

@ -1,172 +0,0 @@
from __future__ import unicode_literals
import pytest
from spacy.attrs import HEAD
import numpy
@pytest.mark.xfail
def test_example_war_and_peace(nlp):
    """Run the pipeline over the downloaded War and Peace text (xfail)."""
    # TODO: ImportError: No module named _doc_examples
    from spacy._doc_examples import download_war_and_peace
    raw_text = download_war_and_peace()
    nlp(raw_text)
def test_main_entry_point(nlp):
    """Call the pipeline with each component switch and with empty input."""
    sample = 'Some text.'
    nlp(sample)                 # Applies tagger, parser, entity
    nlp(sample, parse=False)    # Applies tagger and entity, not parser
    nlp(sample, entity=False)   # Applies tagger and parser, not entity
    nlp(sample, tag=False)      # Does not apply tagger, entity or parser
    nlp('')                     # Zero-length tokens, not an error
    # nlp(b'Some text') would be an error: the input must be unicode,
    # so byte strings are decoded first.
    decoded = b'Some text'.decode('utf8')
    nlp(decoded)
@pytest.mark.models
def test_sentence_spans(nlp):
    """Each sentence span exposes its root token."""
    doc = nlp("This is a sentence. Here's another...")
    roots = []
    for sent in doc.sents:
        roots.append(sent.root.orth_)
    assert roots == ["is", "'s"]
@pytest.mark.models
def test_entity_spans(nlp):
    """The first entity of the sample sentence is the PERSON span 'Best'."""
    doc = nlp('Mr. Best flew to New York on Saturday morning.')
    first_ent = list(doc.ents)[0]
    assert first_ent.label == 346
    assert first_ent.label_ == 'PERSON'
    assert first_ent.orth_ == 'Best'
    # NOTE(review): this compares the attribute with itself, so it only
    # checks that `.string` can be read -- confirm the intended right side.
    assert first_ent.string == first_ent.string
@pytest.mark.models
def test_noun_chunk_spans(nlp):
    """Print label, text and head of the root for every noun chunk."""
    doc = nlp('The sentence in this example has three noun chunks.')
    # Output listed in the original example:
    # NP The sentence <-- has
    # NP this example <-- in
    # NP three noun chunks <-- has
    for np in doc.noun_chunks:
        head_text = np.root.head.orth_
        print(np.label, np.orth_, '<--', head_text)
@pytest.mark.models
def test_count_by(nlp):
    """Count tokens by ORTH id and export the same ids as an int32 column."""
    import numpy
    from spacy import attrs
    doc = nlp('apple apple orange banana')
    counts = doc.count_by(attrs.ORTH)
    assert counts == {3699: 2, 3750: 1, 5965: 1}
    expected = numpy.array([[3699], [3699], [3750], [5965]],
                           dtype=numpy.int32)
    exported = doc.to_array([attrs.ORTH])
    assert repr(exported) == repr(expected)
@pytest.mark.models
def test_read_bytes(nlp):
    """Write two serialized docs to one file and read both back.

    The data goes to a temporary file that is removed afterwards, so the
    test no longer leaves ``test_serialize.bin`` in the working directory.
    """
    import os
    import tempfile
    from spacy.tokens.doc import Doc

    texts = [u'This is a document.', u'This is another.']
    fd, loc = tempfile.mkstemp(suffix='.bin')
    try:
        with os.fdopen(fd, 'wb') as file_:
            for text in texts:
                file_.write(nlp(text).to_bytes())
        docs = []
        with open(loc, 'rb') as file_:
            for byte_string in Doc.read_bytes(file_):
                docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    finally:
        os.remove(loc)
    assert len(docs) == 2
def test_token_span(doc):
    """The first token of a slice keeps its document-level index."""
    first_in_span = doc[4:6][0]
    assert first_in_span.i == 4
@pytest.mark.models
def test_example_i_like_new_york1(nlp):
    """Process the example sentence; the test only checks that this runs."""
    sentence = 'I like New York in Autumn.'
    toks = nlp(sentence)
@pytest.fixture
def toks(nlp):
    """Example sentence with HEAD values loaded from a fixed array."""
    doc = nlp('I like New York in Autumn.')
    # One HEAD value per token, transposed into a single column.
    head_row = numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32')
    doc.from_array([HEAD], head_row.T)
    return doc
def test_example_i_like_new_york2(toks):
    """Unpack one index per token; fails unless there are exactly seven."""
    indices = range(len(toks))
    i, like, new, york, in_, autumn, dot = indices
@pytest.fixture
def tok(toks, tok):
    """Return the index of the token whose local name is given by ``tok``."""
    # The local names below are the lookup keys, so they must not change.
    # NOTE(review): this is registered as a fixture but takes an argument
    # named like itself and is called directly by the fixtures below --
    # confirm this still works with the pytest version in use.
    i, like, new, york, in_, autumn, dot = range(len(toks))
    namespace = locals()
    return namespace[tok]
@pytest.fixture
def new(toks):
    """Index of the token named "new", looked up via ``tok``."""
    index = tok(toks, "new")
    return index
@pytest.fixture
def york(toks):
    """Index of the token named "york", looked up via ``tok``."""
    index = tok(toks, "york")
    return index
@pytest.fixture
def autumn(toks):
    """Index of the token named "autumn", looked up via ``tok``."""
    index = tok(toks, "autumn")
    return index
@pytest.fixture
def dot(toks):
    """Index of the token named "dot", looked up via ``tok``."""
    index = tok(toks, "dot")
    return index
def test_example_i_like_new_york3(toks, new, york):
    """Check the heads of the two tokens of 'New York'."""
    head_of_new = toks[new].head
    assert head_of_new.orth_ == 'York'
    head_of_york = toks[york].head
    assert head_of_york.orth_ == 'like'
def test_example_i_like_new_york4(toks, new, york):
    """The root of the 'New York' span is 'York'."""
    stop = york + 1
    new_york = toks[new:stop]
    assert new_york.root.orth_ == 'York'
def test_example_i_like_new_york5(toks, autumn, dot):
    """Check heads of 'Autumn' and the final dot, and the trailing span root."""
    expected_heads = ((autumn, 'in'), (dot, 'like'))
    for index, head_text in expected_heads:
        assert toks[index].head.orth_ == head_text
    tail = toks[autumn:]
    assert tail.root.orth_ == 'Autumn'
def test_navigating_the_parse_tree_lefts(doc):
    """Collect tokens before the span whose head lies inside the span."""
    # TODO: where does the span object come from?
    span = doc[:2]
    lefts = []
    for i in range(0, span.start):
        candidate = span.doc[i]
        if candidate.head in span:
            lefts.append(candidate)
def test_navigating_the_parse_tree_rights(doc):
    """Collect tokens after the span whose head lies inside the span."""
    span = doc[:2]
    rights = []
    for i in range(span.end, len(span.doc)):
        candidate = span.doc[i]
        if candidate.head in span:
            rights.append(candidate)
def test_string_store(doc):
    """Iterating the string store yields strings in order of their ids."""
    strings = doc.vocab.strings
    position = 0
    for string in strings:
        assert position == strings[string]
        position += 1

View File

@ -1,180 +0,0 @@
from __future__ import unicode_literals
import pytest
import spacy
import os
# Fall back to ``range`` where the ``xrange`` builtin does not exist.
try:
    xrange = xrange
except NameError:
    xrange = range
@pytest.fixture()
def token(doc):
    """First token of the ``doc`` fixture."""
    first = doc[0]
    return first
@pytest.mark.models
def test_load_resources_and_process_text():
    """Build an English pipeline and run it over a short text."""
    from spacy.en import English
    pipeline = English()
    doc = pipeline(u'Hello, world. Here are two sentences.')
@pytest.mark.models
def test_get_tokens_and_sentences(doc):
    """The first sentence starts with the very same token object as the doc."""
    first_sentence = next(doc.sents)
    first_token = doc[0]
    assert first_token is first_sentence[0]
    assert first_sentence.text == 'Hello, world.'
@pytest.mark.models
def test_use_integer_ids_for_any_strings(nlp, token):
    """The string store maps 'Hello' to an integer id and back."""
    strings = nlp.vocab.strings
    hello_id = strings['Hello']
    hello_str = strings[hello_id]
    assert token.orth == hello_id
    assert hello_id == 3125
    assert token.orth_ == hello_str
    assert hello_str == 'Hello'
def test_get_and_set_string_views_and_flags(nlp, token):
    """Overwrite every lexeme's shape by flag and see it through the token."""
    assert token.shape_ == 'Xxxxx'
    for lexeme in nlp.vocab:
        # First matching flag wins: alpha, then digit, then punct.
        if lexeme.is_alpha:
            new_shape = 'W'
        elif lexeme.is_digit:
            new_shape = 'D'
        elif lexeme.is_punct:
            new_shape = 'P'
        else:
            new_shape = 'M'
        lexeme.shape_ = new_shape
    assert token.shape_ == 'W'
def test_export_to_numpy_arrays(nlp, doc):
    """Export three attributes to an array and compare with token values."""
    from spacy.attrs import ORTH, LIKE_URL, IS_OOV
    attr_ids = [ORTH, LIKE_URL, IS_OOV]
    exported = doc.to_array(attr_ids)
    # One row per token, one column per requested attribute.
    assert exported.shape == (len(doc), len(attr_ids))
    for row in (0, 1):
        assert doc[row].orth == exported[row, 0]
    assert doc[0].like_url == exported[0, 1]
    url_flags = [t.like_url for t in doc]
    assert list(exported[:, 1]) == url_flags
@pytest.mark.models
def test_word_vectors(nlp):
    """'Apples'/'oranges' must score higher than 'Boots'/'hippos'."""
    doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
    apples, oranges = doc[0], doc[2]
    boots, hippos = doc[6], doc[8]
    fruit_score = apples.similarity(oranges)
    other_score = boots.similarity(hippos)
    assert fruit_score > other_score
@pytest.mark.models
def test_part_of_speech_tags(nlp):
    """Define (without calling) example helpers for coarse and fine tags."""
    from spacy.parts_of_speech import ADV

    def is_adverb(token):
        adverb_id = spacy.parts_of_speech.ADV
        return token.pos == adverb_id

    # These are data-specific, so no constants are provided. You have to look
    # up the IDs from the StringStore.
    strings = nlp.vocab.strings
    NNS = strings['NNS']
    NNPS = strings['NNPS']

    def is_plural_noun(token):
        if token.tag == NNS:
            return True
        return token.tag == NNPS

    def print_coarse_pos(token):
        print(token.pos_)

    def print_fine_pos(token):
        print(token.tag_)
@pytest.mark.models
def test_syntactic_dependencies():
    """Define (without calling) a helper that walks from a token to the root."""
    def dependency_labels_to_root(token):
        '''Walk up the syntactic tree, collecting the arc labels.'''
        labels = []
        current = token
        # The loop stops at a token that is its own head.
        while current.head is not current:
            labels.append(current.dep)
            current = current.head
        return labels
@pytest.mark.models
def test_named_entities():
    """Define (without calling) example helpers for named entities.

    The helpers are documentation examples; the test only checks that
    they can be defined.
    """
    from collections import defaultdict

    def iter_products(docs):
        '''Yield every entity labelled PRODUCT from an iterable of docs.'''
        for doc in docs:
            for ent in doc.ents:
                if ent.label_ == 'PRODUCT':
                    yield ent

    def word_is_in_entity(word):
        '''Return True if the token has a non-zero entity type.'''
        return word.ent_type != 0

    def count_parent_verb_by_person(docs):
        '''Count, per PERSON entity text, the lemmas of its verb heads.'''
        # presumably VERB lives next to ADV in spacy.parts_of_speech -- confirm
        from spacy.parts_of_speech import VERB
        # The default factory must be a callable; passing a defaultdict
        # instance raises TypeError.
        counts = defaultdict(lambda: defaultdict(int))
        for doc in docs:
            for ent in doc.ents:
                if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
                    counts[ent.orth_][ent.root.head.lemma_] += 1
        return counts
def test_calculate_inline_mark_up_on_original_string():
    """Define (without calling) the inline mark-up example helper."""
    def put_spans_around_tokens(doc, get_classes):
        '''Given some function to compute class names, put each token in a
        span element, with the appropriate classes computed.

        All whitespace is preserved, outside of the spans. (Yes, I know HTML
        won't display it. But the point is no information is lost, so you can
        calculate what you need, e.g. <br /> tags, <p> tags, etc.)
        '''
        template = '<span classes="{classes}">{word}</span>{space}'
        pieces = []
        for token in doc:
            if token.is_space:
                # Whitespace tokens are emitted as they are, without a span.
                piece = token.orth_
            else:
                class_names = ' '.join(get_classes(token))
                piece = template.format(classes=class_names,
                                        word=token.orth_,
                                        space=token.whitespace_)
            pieces.append(piece)
        markup = ''.join(pieces)
        markup = markup.replace('\n', '').replace('\t', ' ')
        return markup
@pytest.mark.models
def test_efficient_binary_serialization(doc):
    """Serialize a doc to a file and load it back with a fresh vocab.

    Both file handles are closed deterministically and the data is written
    to a temporary file that is removed afterwards, rather than leaving
    ``moby_dick.bin`` in the working directory.
    """
    import tempfile
    # Imported explicitly: `import spacy` alone is not shown to load spacy.en.
    from spacy.en import English
    from spacy.tokens.doc import Doc

    byte_string = doc.to_bytes()
    fd, loc = tempfile.mkstemp(suffix='.bin')
    try:
        with os.fdopen(fd, 'wb') as file_:
            file_.write(byte_string)
        nlp = English()
        with open(loc, 'rb') as file_:
            for byte_string in Doc.read_bytes(file_):
                doc = Doc(nlp.vocab)
                doc.from_bytes(byte_string)
    finally:
        os.remove(loc)
@pytest.mark.models
def test_multithreading(nlp):
    """Stream texts through ``nlp.pipe`` and check the first 101 docs."""
    texts = [u'One document.', u'...', u'Lots of documents']
    # .pipe streams input, and produces streaming output
    stream = (texts[n % 3] for n in xrange(100000000))
    piped = nlp.pipe(stream, batch_size=50, n_threads=4)
    for count, doc in enumerate(piped):
        assert doc.is_parsed
        # Stop after index 100; the input generator is effectively endless.
        if count == 100:
            break

View File

@ -94,8 +94,13 @@ def read_regex(path):
def compile_prefix_regex(entries): def compile_prefix_regex(entries):
if '(' in entries:
# Handle deprecated data
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
return re.compile(expression) return re.compile(expression)
else:
expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
return re.compile(expression)
def compile_suffix_regex(entries): def compile_suffix_regex(entries):

View File

@ -22,7 +22,8 @@
"twitter": "spacy_io", "twitter": "spacy_io",
"github": "explosion", "github": "explosion",
"reddit": "spacynlp", "reddit": "spacynlp",
"codepen": "explosion" "codepen": "explosion",
"gitter": "explosion/spaCy"
}, },
"NAVIGATION": { "NAVIGATION": {
@ -53,7 +54,7 @@
} }
}, },
"V_CSS": "1.10", "V_CSS": "1.14",
"V_JS": "1.0", "V_JS": "1.0",
"DEFAULT_SYNTAX" : "python", "DEFAULT_SYNTAX" : "python",
"ANALYTICS": "UA-58931649-1", "ANALYTICS": "UA-58931649-1",

View File

@ -1,6 +1,7 @@
//- 💫 MIXINS > BASE //- 💫 MIXINS > BASE
//- Aside wrapper //- Aside wrapper
label - [string] aside label
mixin aside-wrapper(label) mixin aside-wrapper(label)
aside.c-aside aside.c-aside
@ -21,6 +22,10 @@ mixin date(input)
//- SVG from map //- SVG from map
file - [string] SVG file name in /assets/img/
name - [string] SVG symbol id
width - [integer] width in px
height - [integer] height in px (default: same as width)
mixin svg(file, name, width, height) mixin svg(file, name, width, height)
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
@ -28,19 +33,23 @@ mixin svg(file, name, width, height)
//- Icon //- Icon
name - [string] icon name, should be SVG symbol ID
size - [integer] icon width and height (default: 20)
mixin icon(name, size) mixin icon(name, size)
+svg("icons", "icon-" + name, size || 20).o-icon&attributes(attributes) +svg("icons", name, size || 20).o-icon&attributes(attributes)
//- Pro/Con/Neutral icon //- Pro/Con/Neutral icon
icon - [string] "pro", "con" or "neutral" (default: "neutral")
mixin procon(icon) mixin procon(icon)
- colors = { pro: "green", con: "red" } - colors = { pro: "green", con: "red", neutral: "yellow" }
+icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) +icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
//- Headlines Helper Mixin //- Headlines Helper Mixin
level - [integer] 1, 2, 3, 4, or 5
mixin headline(level) mixin headline(level)
if level == 1 if level == 1
@ -65,6 +74,7 @@ mixin headline(level)
//- Permalink rendering //- Permalink rendering
id - [string] permalink ID used for link anchor
mixin permalink(id) mixin permalink(id)
if id if id
@ -77,6 +87,7 @@ mixin permalink(id)
//- Terminal-style code window //- Terminal-style code window
label - [string] title displayed in top bar of terminal window
mixin terminal(label) mixin terminal(label)
.x-terminal .x-terminal
@ -87,6 +98,18 @@ mixin terminal(label)
block block
//- Gitter chat button and widget
button - [string] text shown on button
label - [string] title of chat window (default: same as button)
mixin gitter(button, label)
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
button.js-gitter-button.c-chat__button.u-text-small
+icon("chat").o-icon--inline
!=button
//- Logo //- Logo
mixin logo() mixin logo()

View File

@ -44,7 +44,7 @@ mixin api(path)
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block
block block
| #[+icon("book", 18).o-icon--inline.u-help.u-color-subtle] | #[+icon("book", 18).o-icon--inline.u-color-subtle]
//- Aside for text //- Aside for text

View File

@ -24,4 +24,6 @@ main.o-main.o-main--sidebar.o-main--aside
.o-inline-list .o-inline-list
+button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)] +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)]
+gitter("spaCy chat")
include _footer include _footer

View File

@ -0,0 +1,23 @@
//- 💫 INCLUDES > SCRIPTS
script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
script(src="/assets/js/prism.js", type="text/javascript")
if SECTION == "docs"
script.
((window.gitter = {}).chat = {}).options = {
useStyles: false,
activationElement: '.js-gitter-button',
targetElement: '.js-gitter',
room: '!{SOCIAL.gitter}'
};
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
if environment == "deploy"
script
| window.ga=window.ga||function(){
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
script(async src="https://www.google-analytics.com/analytics.js")

View File

@ -52,13 +52,4 @@ html(lang="en")
main!=yield main!=yield
include _includes/_footer include _includes/_footer
script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript") include _includes/_scripts
script(src="/assets/js/prism.js", type="text/javascript")
if environment == "deploy"
script
| window.ga=window.ga||function(){
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
script(async src="https://www.google-analytics.com/analytics.js")

View File

@ -6,29 +6,29 @@
font-family: "Source Sans Pro" font-family: "Source Sans Pro"
font-style: normal font-style: normal
font-weight: 400 font-weight: 400
src: url("../fonts/sourcesanspro-regular.eot") src: url("/assets/fonts/sourcesanspro-regular.eot")
src: url("../fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-regular.woff2") format("woff2"), url("../fonts/sourcesanspro-regular.woff") format("woff"), url("../fonts/sourcesanspro-regular.ttf") format("truetype"), url("../fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg") src: url("/assets/fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-regular.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-regular.woff") format("woff"), url("/assets/fonts/sourcesanspro-regular.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg")
@font-face @font-face
font-family: "Source Sans Pro" font-family: "Source Sans Pro"
font-style: italic font-style: italic
font-weight: 400 font-weight: 400
src: url("../fonts/sourcesanspro-italic.eot") src: url("/assets/fonts/sourcesanspro-italic.eot")
src: url("../fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-italic.woff2") format("woff2"), url("../fonts/sourcesanspro-italic.woff") format("woff"), url("../fonts/sourcesanspro-italic.ttf") format("truetype"), url("../fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg") src: url("/assets/fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-italic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-italic.woff") format("woff"), url("/assets/fonts/sourcesanspro-italic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg")
@font-face @font-face
font-family: "Source Sans Pro" font-family: "Source Sans Pro"
font-style: normal font-style: normal
font-weight: 700 font-weight: 700
src: url("../fonts/sourcesanspro-bold.eot") src: url("/assets/fonts/sourcesanspro-bold.eot")
src: url("../fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bold.woff2") format("woff2"), url("../fonts/sourcesanspro-bold.woff") format("woff"), url("../fonts/sourcesanspro-bold.ttf") format("truetype"), url("../fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg") src: url("/assets/fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bold.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bold.woff") format("woff"), url("/assets/fonts/sourcesanspro-bold.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg")
@font-face @font-face
font-family: "Source Sans Pro" font-family: "Source Sans Pro"
font-style: italic font-style: italic
font-weight: 700 font-weight: 700
src: url("../fonts/sourcesanspro-bolditalic.eot") src: url("/assets/fonts/sourcesanspro-bolditalic.eot")
src: url("../fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("../fonts/sourcesanspro-bolditalic.woff") format("woff"), url("../fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("../fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg") src: url("/assets/fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bolditalic.woff") format("woff"), url("/assets/fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg")
// Source Code Pro // Source Code Pro
@ -37,5 +37,5 @@
font-family: "Source Code Pro" font-family: "Source Code Pro"
font-style: normal font-style: normal
font-weight: 600 font-weight: 600
src: url("../fonts/sourcecodepro-semibold.eot") src: url("/assets/fonts/sourcecodepro-semibold.eot")
src: url("../fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcecodepro-semibold.woff") format("woff"), url("../fonts/sourcecodepro-semibold.ttf") format("truetype"), url("../fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg") src: url("/assets/fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcecodepro-semibold.woff") format("woff"), url("/assets/fonts/sourcecodepro-semibold.ttf") format("truetype"), url("/assets/fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg")

View File

@ -60,7 +60,7 @@
background: $color-back background: $color-back
border-radius: 2px border-radius: 2px
border: 1px solid $color-subtle border: 1px solid $color-subtle
padding: 3.5% 2.5% padding: 3rem 2.5%
//- Icons //- Icons

View File

@ -141,12 +141,6 @@
background: $pattern background: $pattern
//- Cursors
.u-help
cursor: help
//- Hidden elements //- Hidden elements
.u-hidden .u-hidden

View File

@ -0,0 +1,100 @@
//- 💫 CSS > COMPONENTS > CHAT
.c-chat
@include position(fixed, top, left, 0, 60%)
bottom: 0
right: 0
display: flex
flex-flow: column nowrap
background: $color-back
transition: transform 0.3s cubic-bezier(0.16, 0.22, 0.22, 1.7)
box-shadow: -0.25rem 0 1rem 0 rgba($color-front, 0.25)
z-index: 100
@include breakpoint(min, md)
left: calc(100% - #{$aside-width} - #{$aside-padding})
@include breakpoint(max, sm)
left: 50%
@include breakpoint(max, xs)
left: 0
&.is-collapsed:not(.is-loading)
transform: translateX(110%)
&:before
@include position(absolute, top, left, 1rem, 2rem)
content: attr(data-title)
font: bold 1.4rem $font-code
text-transform: uppercase
color: $color-back
&:after
@include position(absolute, top, left, 0, 100%)
content: ""
z-index: -1
bottom: 0
right: -100%
background: $color-back
& > iframe
width: 100%
flex: 1 1 calc(100% - #{$nav-height})
border: 0
.gitter-chat-embed-loading-wrapper
@include position(absolute, top, left, 0, 0)
right: 0
bottom: 0
display: none
justify-content: center
align-items: center
.is-loading &
display: flex
.gitter-chat-embed-action-bar,
.gitter-chat-embed-action-bar-item
display: flex
.gitter-chat-embed-action-bar
align-items: center
justify-content: flex-end
background: $color-theme
padding: 0 1rem 0 2rem
flex: 0 0 $nav-height
.gitter-chat-embed-action-bar-item
@include size(40px)
padding: 0
opacity: 0.75
background-position: 50%
background-repeat: no-repeat
background-size: 22px 22px
border: 0
cursor: pointer
transition: all 0.2s ease
&:focus,
&:hover
opacity: 1
&.gitter-chat-embed-action-bar-item-pop-out
background-image: url()
margin-right: -4px
&.gitter-chat-embed-action-bar-item-collapse-chat
background-image: url()
.c-chat__button
@include position(fixed, bottom, right, 0, 2rem)
padding: 1rem 1.5rem
background: $color-front
color: $color-back
border-top-left-radius: 4px
border-top-right-radius: 4px
z-index: 20
border-color: $color-theme
border-style: solid
border-width: 1px 1px 0 1px

View File

@ -24,6 +24,7 @@ $theme: blue !default
@import _components/asides @import _components/asides
@import _components/buttons @import _components/buttons
@import _components/chat
@import _components/code @import _components/code
@import _components/landing @import _components/landing
@import _components/lists @import _components/lists

View File

@ -64,5 +64,6 @@
<symbol id="matt-signature" viewBox="0 0 500 250"> <symbol id="matt-signature" viewBox="0 0 500 250">
<title>matt-signature</title> <title>matt-signature</title>
<path fill="currentColor" d="M18.6 207c-.3-18.8-.8-37.5-1.4-56.2-.6-18.7-1-37.5-1-56.2v-7.2c0-3.5 0-7 .2-11v-18c.8-2.7 1.8-5 3-6.5 1.6-2 3.6-3 6.4-3 3 0 5.4 1 7.6 2 2.2 2 4 4 5.3 6l36.6 71 1.8 3c1 1 2 3 3 3h1l1 1 1-3 22-76c2-3 3-5 4-8l2-9c1-3 2-6 4-8 1-3 4-5 7-7h2c5 0 8 1 10 4 3 2 4 5 5 9 1 3 2 7 1 12v11l1 7c0 3 0 7 1 12 0 4 1 9 1 14l1 14.2 1 12 .6 6v1l1 7.5 1 11.6 1.4 12 1.4 8 1 4 1.7 5.5 1.7 6c.7 1.7 1 3 1.5 3.6-.5 4-1.5 7-3 9-1 2-4 3-8 3h-6l-3-3c-1-1.4-2-2.3-2-3l-4-14-7.6-58V88c0-3.5-1-7-2-10l-2 1.7-18 74v6c0 2-.2 4-1 6 0 2-1 3.5-3 5-1 1.3-3 2-5 2.2-1 0-2 0-3-1l-3.4-2-3-3c-1-1-1.7-2-2-3l-35-52-5.3-10.6v22c0 10.2.2 20.3.6 30.2.4 10 .6 20 .6 30.2v22c0 2-1 4-3 5.4s-3 3-5 3c-3 0-5 0-7-1-1-1-3-3-4-5zm205-63.2c-1.6 2.7-3.4 6-5.3 9.8l-6.2 12.2c-2 4.3-4 8.6-7 13-2 4.2-5 8.2-8 11.7s-5 6.6-9 9c-3 2.5-6 4-9 4.4-1 0-3-1-4-1l-5-2c-1-1-3-2-4-3s-1-3-1-5c1-18 2-33 4-47s6-27 11-38 12-20 20-27 18-12 29-15l2-1h2c5 0 9 2 11 7s4 12 5 23c1 10 2 24 2 40 1 16 2 36 3 59l1 4v5c0 2.6-1 4.5-2 6s-3 2-5 2c-5 0-8-1.7-10-4s-3-6.6-4-11v-4l-1-9s-1-6.7-1-10l-1-8.5v-1l-.2-6-1-7-.5-8.6-1-1zM218 93.5c-4.7 3.4-9.2 8-13.6 13.7-4.4 5.8-7.5 11.3-9.4 16.8-.8 2.5-1.8 6-2.8 10.4-1 4.4-2 8.8-2.7 13l-2 12-.7 7c.2 0 .4-.2.6-.5l.6-1c10.5-10 18-21 22.2-33 4.6-12 7-25 7.7-39zm72 47c-2.3 0-4.4.6-6.2 1.8-2 1.2-4 1.8-6.6 1.8h-5.4c-.7-1-1.4-1-2.3-2l-2.5-2c-.8 0-1.6-1-2.2-2-.6-1-1-2-1-3 0-2 1-4 3-6 2-1 4.5-3 7.2-4l8.3-3s5-2 6.7-3v-11c0-12-.6-25-1.8-38-1.2-12-1.8-25-1.8-37 0-3 .8-6 2.5-7 1-1 4-1 6-1 3 0 6 1 7 3s2 4 3 7c0 3 1 6 1 9v20l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2.3 9-3.4 14-3.4 3 0 6 1 7 3.5s3 5 3 8c0 2-1 4-3 5l-6 3-46 17-1.5 1s-1 0-1.5 1v8c0 6 0 12 .5 18s1 12.3 2 18.3l3 15c1 5 1.4 10 1.4 15 0 1.4-.6 3.5-1.6 6s-2 4-4.7 4c-5 0-8.7-1.6-11.6-4-3-3-4.3-6.6-4.6-11l-2.2-29-2.7-30h-1zm112 0c-2.4 0-4.5.6-6.3 1.8-2 1.2-4 1.8-6.6 1.8h-5c0-1-1-1-2-2l-2-2c-1 0-1-1-2-2 0-1-1-2-1-3 0-2 1-4 3-6 2-1 5-3 7-4l8-3s5-2 7-3v-11c0-12 0-25-2-38-1-12-1-25-1-37 0-3 1-6 3-7s4-1 7-1c4 0 6 1 8 3s3 4 3 7c1 3 1 6 1 9s0 6 1 8v11l1 18 1 18 
1 12 4-1 6-2 6-2 4-1 14-6c4-2 9-4 14-4 4 0 6 1 8 4s3 5 3 8c0 2-1 4-2 5l-5.3 3-49 13.8-1.5 1s-1 .5-1.5 1V157l1 18.3c0 5 1 10 2 15s1 10 1 15c0 1.5-1 3.6-2 6s-3 4-5 4c-5 0-9-1.5-12-4.2s-5-6-5-11l-3-28.3-3-30.3h-1z"/> <path fill="currentColor" d="M18.6 207c-.3-18.8-.8-37.5-1.4-56.2-.6-18.7-1-37.5-1-56.2v-7.2c0-3.5 0-7 .2-11v-18c.8-2.7 1.8-5 3-6.5 1.6-2 3.6-3 6.4-3 3 0 5.4 1 7.6 2 2.2 2 4 4 5.3 6l36.6 71 1.8 3c1 1 2 3 3 3h1l1 1 1-3 22-76c2-3 3-5 4-8l2-9c1-3 2-6 4-8 1-3 4-5 7-7h2c5 0 8 1 10 4 3 2 4 5 5 9 1 3 2 7 1 12v11l1 7c0 3 0 7 1 12 0 4 1 9 1 14l1 14.2 1 12 .6 6v1l1 7.5 1 11.6 1.4 12 1.4 8 1 4 1.7 5.5 1.7 6c.7 1.7 1 3 1.5 3.6-.5 4-1.5 7-3 9-1 2-4 3-8 3h-6l-3-3c-1-1.4-2-2.3-2-3l-4-14-7.6-58V88c0-3.5-1-7-2-10l-2 1.7-18 74v6c0 2-.2 4-1 6 0 2-1 3.5-3 5-1 1.3-3 2-5 2.2-1 0-2 0-3-1l-3.4-2-3-3c-1-1-1.7-2-2-3l-35-52-5.3-10.6v22c0 10.2.2 20.3.6 30.2.4 10 .6 20 .6 30.2v22c0 2-1 4-3 5.4s-3 3-5 3c-3 0-5 0-7-1-1-1-3-3-4-5zm205-63.2c-1.6 2.7-3.4 6-5.3 9.8l-6.2 12.2c-2 4.3-4 8.6-7 13-2 4.2-5 8.2-8 11.7s-5 6.6-9 9c-3 2.5-6 4-9 4.4-1 0-3-1-4-1l-5-2c-1-1-3-2-4-3s-1-3-1-5c1-18 2-33 4-47s6-27 11-38 12-20 20-27 18-12 29-15l2-1h2c5 0 9 2 11 7s4 12 5 23c1 10 2 24 2 40 1 16 2 36 3 59l1 4v5c0 2.6-1 4.5-2 6s-3 2-5 2c-5 0-8-1.7-10-4s-3-6.6-4-11v-4l-1-9s-1-6.7-1-10l-1-8.5v-1l-.2-6-1-7-.5-8.6-1-1zM218 93.5c-4.7 3.4-9.2 8-13.6 13.7-4.4 5.8-7.5 11.3-9.4 16.8-.8 2.5-1.8 6-2.8 10.4-1 4.4-2 8.8-2.7 13l-2 12-.7 7c.2 0 .4-.2.6-.5l.6-1c10.5-10 18-21 22.2-33 4.6-12 7-25 7.7-39zm72 47c-2.3 0-4.4.6-6.2 1.8-2 1.2-4 1.8-6.6 1.8h-5.4c-.7-1-1.4-1-2.3-2l-2.5-2c-.8 0-1.6-1-2.2-2-.6-1-1-2-1-3 0-2 1-4 3-6 2-1 4.5-3 7.2-4l8.3-3s5-2 6.7-3v-11c0-12-.6-25-1.8-38-1.2-12-1.8-25-1.8-37 0-3 .8-6 2.5-7 1-1 4-1 6-1 3 0 6 1 7 3s2 4 3 7c0 3 1 6 1 9v20l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2.3 9-3.4 14-3.4 3 0 6 1 7 3.5s3 5 3 8c0 2-1 4-3 5l-6 3-46 17-1.5 1s-1 0-1.5 1v8c0 6 0 12 .5 18s1 12.3 2 18.3l3 15c1 5 1.4 10 1.4 15 0 1.4-.6 3.5-1.6 6s-2 4-4.7 4c-5 0-8.7-1.6-11.6-4-3-3-4.3-6.6-4.6-11l-2.2-29-2.7-30h-1zm112 0c-2.4 
0-4.5.6-6.3 1.8-2 1.2-4 1.8-6.6 1.8h-5c0-1-1-1-2-2l-2-2c-1 0-1-1-2-2 0-1-1-2-1-3 0-2 1-4 3-6 2-1 5-3 7-4l8-3s5-2 7-3v-11c0-12 0-25-2-38-1-12-1-25-1-37 0-3 1-6 3-7s4-1 7-1c4 0 6 1 8 3s3 4 3 7c1 3 1 6 1 9s0 6 1 8v11l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2 9-4 14-4 4 0 6 1 8 4s3 5 3 8c0 2-1 4-2 5l-5.3 3-49 13.8-1.5 1s-1 .5-1.5 1V157l1 18.3c0 5 1 10 2 15s1 10 1 15c0 1.5-1 3.6-2 6s-3 4-5 4c-5 0-9-1.5-12-4.2s-5-6-5-11l-3-28.3-3-30.3h-1z"/>
</symbol>
</defs> </defs>
</svg> </svg>

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 15 KiB

View File

@ -1,32 +1,28 @@
<svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> <svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<defs> <defs>
<symbol id="icon-github" viewBox="0 0 27 32"> <symbol id="github" viewBox="0 0 27 32">
<title>github</title> <path d="M13.714 2.286q3.732 0 6.884 1.839t4.991 4.991 1.839 6.884q0 4.482-2.616 8.063t-6.759 4.955q-0.482 0.089-0.714-0.125t-0.232-0.536q0-0.054 0.009-1.366t0.009-2.402q0-1.732-0.929-2.536 1.018-0.107 1.83-0.321t1.679-0.696 1.446-1.188 0.946-1.875 0.366-2.688q0-2.125-1.411-3.679 0.661-1.625-0.143-3.643-0.5-0.161-1.446 0.196t-1.643 0.786l-0.679 0.429q-1.661-0.464-3.429-0.464t-3.429 0.464q-0.286-0.196-0.759-0.482t-1.491-0.688-1.518-0.241q-0.804 2.018-0.143 3.643-1.411 1.554-1.411 3.679 0 1.518 0.366 2.679t0.938 1.875 1.438 1.196 1.679 0.696 1.83 0.321q-0.696 0.643-0.875 1.839-0.375 0.179-0.804 0.268t-1.018 0.089-1.17-0.384-0.991-1.116q-0.339-0.571-0.866-0.929t-0.884-0.429l-0.357-0.054q-0.375 0-0.518 0.080t-0.089 0.205 0.161 0.25 0.232 0.214l0.125 0.089q0.393 0.179 0.777 0.679t0.563 0.911l0.179 0.411q0.232 0.679 0.786 1.098t1.196 0.536 1.241 0.125 0.991-0.063l0.411-0.071q0 0.679 0.009 1.58t0.009 0.973q0 0.321-0.232 0.536t-0.714 0.125q-4.143-1.375-6.759-4.955t-2.616-8.063q0-3.732 1.839-6.884t4.991-4.991 6.884-1.839zM5.196 21.982q0.054-0.125-0.125-0.214-0.179-0.054-0.232 0.036-0.054 0.125 0.125 0.214 0.161 0.107 0.232-0.036zM5.75 22.589q0.125-0.089-0.036-0.286-0.179-0.161-0.286-0.054-0.125 0.089 0.036 0.286 0.179 0.179 0.286 0.054zM6.286 23.393q0.161-0.125 0-0.339-0.143-0.232-0.304-0.107-0.161 0.089 0 0.321t0.304 0.125zM7.036 24.143q0.143-0.143-0.071-0.339-0.214-0.214-0.357-0.054-0.161 0.143 0.071 0.339 0.214 0.214 0.357 0.054zM8.054 24.589q0.054-0.196-0.232-0.286-0.268-0.071-0.339 0.125t0.232 0.268q0.268 0.107 0.339-0.107zM9.179 24.679q0-0.232-0.304-0.196-0.286 0-0.286 0.196 0 0.232 0.304 0.196 0.286 0 0.286-0.196zM10.214 24.5q-0.036-0.196-0.321-0.161-0.286 0.054-0.25 0.268t0.321 0.143 0.25-0.25z"></path>
<path class="path1" d="M13.714 2.286q3.732 0 6.884 1.839t4.991 4.991 1.839 6.884q0 4.482-2.616 8.063t-6.759 4.955q-0.482 0.089-0.714-0.125t-0.232-0.536q0-0.054 0.009-1.366t0.009-2.402q0-1.732-0.929-2.536 1.018-0.107 1.83-0.321t1.679-0.696 1.446-1.188 0.946-1.875 0.366-2.688q0-2.125-1.411-3.679 0.661-1.625-0.143-3.643-0.5-0.161-1.446 0.196t-1.643 0.786l-0.679 0.429q-1.661-0.464-3.429-0.464t-3.429 0.464q-0.286-0.196-0.759-0.482t-1.491-0.688-1.518-0.241q-0.804 2.018-0.143 3.643-1.411 1.554-1.411 3.679 0 1.518 0.366 2.679t0.938 1.875 1.438 1.196 1.679 0.696 1.83 0.321q-0.696 0.643-0.875 1.839-0.375 0.179-0.804 0.268t-1.018 0.089-1.17-0.384-0.991-1.116q-0.339-0.571-0.866-0.929t-0.884-0.429l-0.357-0.054q-0.375 0-0.518 0.080t-0.089 0.205 0.161 0.25 0.232 0.214l0.125 0.089q0.393 0.179 0.777 0.679t0.563 0.911l0.179 0.411q0.232 0.679 0.786 1.098t1.196 0.536 1.241 0.125 0.991-0.063l0.411-0.071q0 0.679 0.009 1.58t0.009 0.973q0 0.321-0.232 0.536t-0.714 0.125q-4.143-1.375-6.759-4.955t-2.616-8.063q0-3.732 1.839-6.884t4.991-4.991 6.884-1.839zM5.196 21.982q0.054-0.125-0.125-0.214-0.179-0.054-0.232 0.036-0.054 0.125 0.125 0.214 0.161 0.107 0.232-0.036zM5.75 22.589q0.125-0.089-0.036-0.286-0.179-0.161-0.286-0.054-0.125 0.089 0.036 0.286 0.179 0.179 0.286 0.054zM6.286 23.393q0.161-0.125 0-0.339-0.143-0.232-0.304-0.107-0.161 0.089 0 0.321t0.304 0.125zM7.036 24.143q0.143-0.143-0.071-0.339-0.214-0.214-0.357-0.054-0.161 0.143 0.071 0.339 0.214 0.214 0.357 0.054zM8.054 24.589q0.054-0.196-0.232-0.286-0.268-0.071-0.339 0.125t0.232 0.268q0.268 0.107 0.339-0.107zM9.179 24.679q0-0.232-0.304-0.196-0.286 0-0.286 0.196 0 0.232 0.304 0.196 0.286 0 0.286-0.196zM10.214 24.5q-0.036-0.196-0.321-0.161-0.286 0.054-0.25 0.268t0.321 0.143 0.25-0.25z"></path>
</symbol> </symbol>
<symbol id="icon-code" viewBox="0 0 20 20"> <symbol id="code" viewBox="0 0 20 20">
<title>code</title> <path d="M5.719 14.75c-0.236 0-0.474-0.083-0.664-0.252l-5.060-4.498 5.341-4.748c0.412-0.365 1.044-0.33 1.411 0.083s0.33 1.045-0.083 1.412l-3.659 3.253 3.378 3.002c0.413 0.367 0.45 0.999 0.083 1.412-0.197 0.223-0.472 0.336-0.747 0.336zM14.664 14.748l5.341-4.748-5.060-4.498c-0.413-0.367-1.045-0.33-1.411 0.083s-0.33 1.045 0.083 1.412l3.378 3.003-3.659 3.252c-0.413 0.367-0.45 0.999-0.083 1.412 0.197 0.223 0.472 0.336 0.747 0.336 0.236 0 0.474-0.083 0.664-0.252zM9.986 16.165l2-12c0.091-0.545-0.277-1.060-0.822-1.151-0.547-0.092-1.061 0.277-1.15 0.822l-2 12c-0.091 0.545 0.277 1.060 0.822 1.151 0.056 0.009 0.11 0.013 0.165 0.013 0.48 0 0.904-0.347 0.985-0.835z"></path>
<path class="path1" d="M5.719 14.75c-0.236 0-0.474-0.083-0.664-0.252l-5.060-4.498 5.341-4.748c0.412-0.365 1.044-0.33 1.411 0.083s0.33 1.045-0.083 1.412l-3.659 3.253 3.378 3.002c0.413 0.367 0.45 0.999 0.083 1.412-0.197 0.223-0.472 0.336-0.747 0.336zM14.664 14.748l5.341-4.748-5.060-4.498c-0.413-0.367-1.045-0.33-1.411 0.083s-0.33 1.045 0.083 1.412l3.378 3.003-3.659 3.252c-0.413 0.367-0.45 0.999-0.083 1.412 0.197 0.223 0.472 0.336 0.747 0.336 0.236 0 0.474-0.083 0.664-0.252zM9.986 16.165l2-12c0.091-0.545-0.277-1.060-0.822-1.151-0.547-0.092-1.061 0.277-1.15 0.822l-2 12c-0.091 0.545 0.277 1.060 0.822 1.151 0.056 0.009 0.11 0.013 0.165 0.013 0.48 0 0.904-0.347 0.985-0.835z"></path>
</symbol> </symbol>
<symbol id="icon-anchor" viewBox="0 0 16 16"> <symbol id="anchor" viewBox="0 0 16 16">
<title>anchor</title> <path d="M14.779 12.779c-1.471 1.993-4.031 3.245-6.779 3.221-2.748 0.023-5.309-1.229-6.779-3.221l-1.221 1.221v-4h4l-1.1 1.099c0.882 1.46 2.357 2.509 4.1 2.807v-6.047c-1.723-0.446-3-1.997-3-3.858 0-2.209 1.791-4 4-4s4 1.791 4 4c0 1.862-1.277 3.413-3 3.858v6.047c1.742-0.297 3.218-1.347 4.099-2.807l-1.1-1.099h4v4l-1.221-1.221zM10 4c0-1.104-0.895-2-2-2s-2 0.895-2 2c0 1.104 0.895 2 2 2s2-0.896 2-2z"></path>
<path class="path1" d="M14.779 12.779c-1.471 1.993-4.031 3.245-6.779 3.221-2.748 0.023-5.309-1.229-6.779-3.221l-1.221 1.221v-4h4l-1.1 1.099c0.882 1.46 2.357 2.509 4.1 2.807v-6.047c-1.723-0.446-3-1.997-3-3.858 0-2.209 1.791-4 4-4s4 1.791 4 4c0 1.862-1.277 3.413-3 3.858v6.047c1.742-0.297 3.218-1.347 4.099-2.807l-1.1-1.099h4v4l-1.221-1.221zM10 4c0-1.104-0.895-2-2-2s-2 0.895-2 2c0 1.104 0.895 2 2 2s2-0.896 2-2z"></path>
</symbol> </symbol>
<symbol id="icon-book" viewBox="0 0 24 24"> <symbol id="book" viewBox="0 0 24 24">
<title>book</title> <path d="M18.984 6.984v-1.969h-9.984v1.969h9.984zM15 15v-2.016h-6v2.016h6zM18.984 11.016v-2.016h-9.984v2.016h9.984zM20.016 2.016c1.078 0 1.969 0.891 1.969 1.969v12c0 1.078-0.891 2.016-1.969 2.016h-12c-1.078 0-2.016-0.938-2.016-2.016v-12c0-1.078 0.938-1.969 2.016-1.969h12zM3.984 6v14.016h14.016v1.969h-14.016c-1.078 0-1.969-0.891-1.969-1.969v-14.016h1.969z"></path>
<path class="path1" d="M18.984 6.984v-1.969h-9.984v1.969h9.984zM15 15v-2.016h-6v2.016h6zM18.984 11.016v-2.016h-9.984v2.016h9.984zM20.016 2.016c1.078 0 1.969 0.891 1.969 1.969v12c0 1.078-0.891 2.016-1.969 2.016h-12c-1.078 0-2.016-0.938-2.016-2.016v-12c0-1.078 0.938-1.969 2.016-1.969h12zM3.984 6v14.016h14.016v1.969h-14.016c-1.078 0-1.969-0.891-1.969-1.969v-14.016h1.969z"></path>
</symbol> </symbol>
<symbol id="icon-pro" viewBox="0 0 20 20"> <symbol id="pro" viewBox="0 0 20 20">
<title>pro</title> <path d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-4v4h-2v-4h-4v-2h4v-4h2v4h4v2z"></path>
<path class="path1" d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-4v4h-2v-4h-4v-2h4v-4h2v4h4v2z"></path>
</symbol> </symbol>
<symbol id="icon-con" viewBox="0 0 20 20"> <symbol id="con" viewBox="0 0 20 20">
<title>con</title> <path d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-10v-2h10v2z"></path>
<path class="path1" d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-10v-2h10v2z"></path>
</symbol> </symbol>
<symbol id="icon-neutral" viewBox="0 0 20 20"> <symbol id="neutral" viewBox="0 0 20 20">
<title>neutral</title> <path d="M9.999 0.8c-5.081 0-9.199 4.119-9.199 9.201 0 5.080 4.118 9.199 9.199 9.199s9.2-4.119 9.2-9.199c0-5.082-4.119-9.201-9.2-9.201zM10 13.001c-1.657 0-3-1.344-3-3s1.343-3 3-3c1.656 0 3 1.344 3 3s-1.344 3-3 3z"></path>
<path class="path1" d="M9.999 0.8c-5.081 0-9.199 4.119-9.199 9.201 0 5.080 4.118 9.199 9.199 9.199s9.2-4.119 9.2-9.199c0-5.082-4.119-9.201-9.2-9.201zM10 13.001c-1.657 0-3-1.344-3-3s1.343-3 3-3c1.656 0 3 1.344 3 3s-1.344 3-3 3z"></path> </symbol>
<symbol id="chat" viewBox="0 0 24 24">
<path d="M18 8.016v-2.016h-12v2.016h12zM18 11.016v-2.016h-12v2.016h12zM18 14.016v-2.016h-12v2.016h12zM21.984 3.984v18l-3.984-3.984h-14.016c-1.078 0-1.969-0.938-1.969-2.016v-12c0-1.078 0.891-1.969 1.969-1.969h16.031c1.078 0 1.969 0.891 1.969 1.969z"></path>
</symbol> </symbol>
</defs> </defs>
</svg> </svg>

Before

Width:  |  Height:  |  Size: 4.7 KiB

After

Width:  |  Height:  |  Size: 4.6 KiB

View File

@ -23,7 +23,7 @@ p
+row +row
+cell Multi-language support +cell Multi-language support
each icon in [ "con", "pro", "pro", "pro" ] each icon in [ "neutral", "pro", "pro", "pro" ]
+cell.u-text-center #[+procon(icon)] +cell.u-text-center #[+procon(icon)]
+row +row

View File

@ -2,8 +2,6 @@
include ../_includes/_mixins include ../_includes/_mixins
p=lorem_short
+aside("Help us improve the docs") +aside("Help us improve the docs")
| Did you spot a mistake or come across explanations that | Did you spot a mistake or come across explanations that
| are unclear? You can find a "Suggest edits" button at the | are unclear? You can find a "Suggest edits" button at the

View File

@ -57,7 +57,7 @@ p
doc.ents = [Span(0, 1, label='GPE')] doc.ents = [Span(0, 1, label='GPE')]
assert doc[0].ent_type_ == 'GPE' assert doc[0].ent_type_ == 'GPE'
doc.ents = [] doc.ents = []
doc.ents = [(u'LondonCity', 0, 1, u'GPE')] doc.ents = [(u'LondonCity', u'GPE', 0, 1)]
p p
| The value you assign should be a sequence, the values of which | The value you assign should be a sequence, the values of which

View File

@ -30,6 +30,13 @@ p Many of the associated tools and resources that we're developing alongside spa
+cell +cell
| REST microservices for spaCy demos and visualisers. | REST microservices for spaCy demos and visualisers.
+row
+cell
+src(gh("spacy-notebooks")) spaCy Notebooks
+cell
| Jupyter notebooks for spaCy examples and tutorials.
+h(2, "libraries") Libraries and projects +h(2, "libraries") Libraries and projects
+table(["Name", "Description"]) +table(["Name", "Description"])
+row +row

View File

@ -141,7 +141,7 @@ p
span.merge(label=label, tag='NNP' if label else span.root.tag_) span.merge(label=label, tag='NNP' if label else span.root.tag_)
matcher.add_entity('GoogleNow', on_match=merge_phrases) matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', {ORTH: 'Google'}, {ORTH: 'Now'}]) matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded']) doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
matcher(doc) matcher(doc)
print([w.text for w in doc]) print([w.text for w in doc])