mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge branch 'master' of ssh://github.com/explosion/spaCy
This commit is contained in:
commit
3eb6a929f3
|
@ -33,6 +33,7 @@ We use the following system to tag our issues:
|
|||
| [`install`](https://github.com/explosion/spaCy/labels/install) | Installation problems |
|
||||
| [`performance`](https://github.com/explosion/spaCy/labels/performance) | Accuracy, speed and memory use problems |
|
||||
| [`tests`](https://github.com/explosion/spaCy/labels/tests) | Missing or incorrect [tests](spacy/tests) |
|
||||
| [`english`](https://github.com/explosion/spaCy/labels/english), [`german`](https://github.com/explosion/spaCy/labels/german) | Issues related to the specific languages, models and data |
|
||||
| [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems |
|
||||
| [`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers |
|
||||
| [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before |
|
||||
|
|
|
@ -3,8 +3,10 @@ spaCy: Industrial-strength NLP
|
|||
|
||||
spaCy is a library for advanced natural language processing in Python and
|
||||
Cython. spaCy is built on the very latest research, but it isn't researchware.
|
||||
It was designed from day 1 to be used in real products. It's commercial
|
||||
open-source software, released under the MIT license.
|
||||
It was designed from day one to be used in real products. spaCy currently supports
|
||||
English and German, as well as tokenization for Chinese, Spanish, Italian, French,
|
||||
Portuguese, Dutch, Swedish and Hungarian. It's commercial open-source software,
|
||||
released under the MIT license.
|
||||
|
||||
💫 **Version 1.5 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
|
||||
|
||||
|
@ -24,7 +26,7 @@ open-source software, released under the MIT license.
|
|||
:target: https://pypi.python.org/pypi/spacy
|
||||
:alt: pypi Version
|
||||
|
||||
.. image:: https://badges.gitter.im/spaCy-users.png
|
||||
.. image:: https://badges.gitter.im/explosion.png
|
||||
:target: https://gitter.im/explosion/spaCy
|
||||
:alt: spaCy on Gitter
|
||||
|
||||
|
|
|
@ -71,6 +71,8 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc):
|
|||
features = get_templates('basic')
|
||||
|
||||
model_dir = pathlib.Path(model_dir)
|
||||
if not (model_dir / 'deps').exists():
|
||||
(model_dir / 'deps').mkdir()
|
||||
with (model_dir / 'deps' / 'config.json').open('w') as file_:
|
||||
json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)
|
||||
|
||||
|
|
3
setup.py
3
setup.py
|
@ -47,8 +47,7 @@ PACKAGES = [
|
|||
'spacy.tests.tokenizer',
|
||||
'spacy.tests.tokens',
|
||||
'spacy.tests.vectors',
|
||||
'spacy.tests.vocab',
|
||||
'spacy.tests.website']
|
||||
'spacy.tests.vocab']
|
||||
|
||||
|
||||
MOD_NAMES = [
|
||||
|
|
|
@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
||||
TAG_MAP = dict(TAG_MAP)
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
|
||||
|
||||
|
||||
|
|
|
@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = {
|
|||
|
||||
|
||||
ORTH_ONLY = [
|
||||
"'",
|
||||
"\\\")",
|
||||
"<space>",
|
||||
"a.",
|
||||
"ä.",
|
||||
"A.C.",
|
||||
"a.D.",
|
||||
"A.D.",
|
||||
|
@ -530,24 +525,20 @@ ORTH_ONLY = [
|
|||
"Abs.",
|
||||
"adv.",
|
||||
"al.",
|
||||
"b.",
|
||||
"B.A.",
|
||||
"B.Sc.",
|
||||
"betr.",
|
||||
"biol.",
|
||||
"Biol.",
|
||||
"c.",
|
||||
"ca.",
|
||||
"Chr.",
|
||||
"Cie.",
|
||||
"co.",
|
||||
"Co.",
|
||||
"d.",
|
||||
"D.C.",
|
||||
"Dipl.-Ing.",
|
||||
"Dipl.",
|
||||
"Dr.",
|
||||
"e.",
|
||||
"e.g.",
|
||||
"e.V.",
|
||||
"ehem.",
|
||||
|
@ -555,79 +546,57 @@ ORTH_ONLY = [
|
|||
"erm.",
|
||||
"etc.",
|
||||
"ev.",
|
||||
"f.",
|
||||
"g.",
|
||||
"G.m.b.H.",
|
||||
"geb.",
|
||||
"Gebr.",
|
||||
"gem.",
|
||||
"h.",
|
||||
"h.c.",
|
||||
"Hg.",
|
||||
"hrsg.",
|
||||
"Hrsg.",
|
||||
"i.",
|
||||
"i.A.",
|
||||
"i.e.",
|
||||
"i.G.",
|
||||
"i.Tr.",
|
||||
"i.V.",
|
||||
"Ing.",
|
||||
"j.",
|
||||
"jr.",
|
||||
"Jr.",
|
||||
"jun.",
|
||||
"jur.",
|
||||
"k.",
|
||||
"K.O.",
|
||||
"l.",
|
||||
"L.A.",
|
||||
"lat.",
|
||||
"m.",
|
||||
"M.A.",
|
||||
"m.E.",
|
||||
"m.M.",
|
||||
"M.Sc.",
|
||||
"Mr.",
|
||||
"n.",
|
||||
"N.Y.",
|
||||
"N.Y.C.",
|
||||
"nat.",
|
||||
"ö."
|
||||
"o.",
|
||||
"o.a.",
|
||||
"o.ä.",
|
||||
"o.g.",
|
||||
"o.k.",
|
||||
"O.K.",
|
||||
"p.",
|
||||
"p.a.",
|
||||
"p.s.",
|
||||
"P.S.",
|
||||
"pers.",
|
||||
"phil.",
|
||||
"q.",
|
||||
"q.e.d.",
|
||||
"r.",
|
||||
"R.I.P.",
|
||||
"rer.",
|
||||
"s.",
|
||||
"sen.",
|
||||
"St.",
|
||||
"std.",
|
||||
"t.",
|
||||
"u.",
|
||||
"ü.",
|
||||
"u.a.",
|
||||
"U.S.",
|
||||
"U.S.A.",
|
||||
"U.S.S.",
|
||||
"v.",
|
||||
"Vol.",
|
||||
"vs.",
|
||||
"w.",
|
||||
"wiss.",
|
||||
"x.",
|
||||
"y.",
|
||||
"z."
|
||||
"wiss."
|
||||
]
|
||||
|
|
|
@ -37,14 +37,16 @@ def get_time_exc(hours):
|
|||
return exc
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
||||
TAG_MAP = dict(TAG_MAP)
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"]
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -40,11 +40,14 @@ def get_time_exc(hours):
|
|||
return exc
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
|
||||
|
|
|
@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = {
|
|||
|
||||
|
||||
# Spanish abbreviations that are registered verbatim as tokenizer exceptions.
# Single-letter entries ("a.", "b.", ...) are not listed here; the shared
# ABBREVIATIONS list already contains them.
# Every entry carries a trailing comma: without one, Python silently joins
# adjacent string literals and both abbreviations are lost.
ORTH_ONLY = [
    "a.C.",
    "a.J.C.",
    "apdo.",
    "Av.",
    "Avda.",
    "Cía.",
    "etc.",
    "Gob.",
    "Gral.",
    "Ing.",
    "J.C.",
    "Lic.",
    "m.n.",
    "no.",
    "núm.",
    "P.D.",
    "Prof.",
    "Profa.",
    "q.e.p.d.",
    "S.A.",
    "S.L.",
    "s.s.s.",
    "Sr.",
    "Sra.",
    "Srta.",
]
|
||||
|
|
|
@ -2,13 +2,16 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .. import language_data as base
|
||||
from ..language_data import strings_to_exc
|
||||
from ..language_data import strings_to_exc, update_exc
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
|
||||
|
|
|
@ -4,21 +4,25 @@ from __future__ import unicode_literals
|
|||
import six
|
||||
|
||||
from spacy.language_data import strings_to_exc, update_exc
|
||||
from .punctuations import *
|
||||
from .punctuation import *
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import ABBREVIATIONS
|
||||
from .tokenizer_exceptions import OTHER_EXC
|
||||
from .. import language_data as base
|
||||
|
||||
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES
|
||||
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
|
||||
TOKENIZER_INFIXES = TOKENIZER_INFIXES
|
||||
|
||||
# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
|
||||
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
|
||||
|
||||
|
||||
TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
|
||||
TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES
|
||||
TOKENIZER_INFIXES = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
||||
|
|
25
spacy/hu/punctuation.py
Normal file
25
spacy/hu/punctuation.py
Normal file
|
@ -0,0 +1,25 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES
|
||||
|
||||
|
||||
TOKENIZER_SUFFIXES = [
|
||||
r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
|
||||
]
|
||||
|
||||
# Infix patterns: each regex marks a separator that splits a chunk when it
# appears between the characters named in the look-behind / look-ahead.
TOKENIZER_INFIXES = [
    r'(?<=[0-9])-(?=[0-9])',
    r'(?<=[0-9])[+\-\*/^](?=[0-9])',
    # The {a} placeholder has to be filled in like the patterns below; left
    # unformatted, the class would match only the literal "{", "a" and "}".
    r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
    r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
    r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
    r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA),
    r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
]
|
||||
|
||||
|
||||
TOKENIZER_INFIXES += LIST_ELLIPSES
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
|
@ -1,89 +0,0 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
TOKENIZER_PREFIXES = r'''
|
||||
+
|
||||
'''.strip().split('\n')
|
||||
|
||||
TOKENIZER_SUFFIXES = r'''
|
||||
,
|
||||
\"
|
||||
\)
|
||||
\]
|
||||
\}
|
||||
\*
|
||||
\!
|
||||
\?
|
||||
\$
|
||||
>
|
||||
:
|
||||
;
|
||||
'
|
||||
”
|
||||
“
|
||||
«
|
||||
_
|
||||
''
|
||||
’
|
||||
‘
|
||||
€
|
||||
\.\.
|
||||
\.\.\.
|
||||
\.\.\.\.
|
||||
(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”+-])\.
|
||||
(?<=[a-züóőúéáűí)])-e
|
||||
\-\-
|
||||
´
|
||||
(?<=[0-9])\+
|
||||
(?<=[a-z0-9üóőúéáűí][\)\]”"'%\)§/])\.
|
||||
(?<=[0-9])km²
|
||||
(?<=[0-9])m²
|
||||
(?<=[0-9])cm²
|
||||
(?<=[0-9])mm²
|
||||
(?<=[0-9])km³
|
||||
(?<=[0-9])m³
|
||||
(?<=[0-9])cm³
|
||||
(?<=[0-9])mm³
|
||||
(?<=[0-9])ha
|
||||
(?<=[0-9])km
|
||||
(?<=[0-9])m
|
||||
(?<=[0-9])cm
|
||||
(?<=[0-9])mm
|
||||
(?<=[0-9])µm
|
||||
(?<=[0-9])nm
|
||||
(?<=[0-9])yd
|
||||
(?<=[0-9])in
|
||||
(?<=[0-9])ft
|
||||
(?<=[0-9])kg
|
||||
(?<=[0-9])g
|
||||
(?<=[0-9])mg
|
||||
(?<=[0-9])µg
|
||||
(?<=[0-9])t
|
||||
(?<=[0-9])lb
|
||||
(?<=[0-9])oz
|
||||
(?<=[0-9])m/s
|
||||
(?<=[0-9])km/h
|
||||
(?<=[0-9])mph
|
||||
(?<=°[FCK])\.
|
||||
(?<=[0-9])hPa
|
||||
(?<=[0-9])Pa
|
||||
(?<=[0-9])mbar
|
||||
(?<=[0-9])mb
|
||||
(?<=[0-9])T
|
||||
(?<=[0-9])G
|
||||
(?<=[0-9])M
|
||||
(?<=[0-9])K
|
||||
(?<=[0-9])kb
|
||||
'''.strip().split('\n')
|
||||
|
||||
TOKENIZER_INFIXES = r'''
|
||||
…
|
||||
\.\.+
|
||||
(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ])
|
||||
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
|
||||
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
|
||||
(?<=[0-9])[+\-\*/^](?=[0-9])
|
||||
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
|
||||
'''.strip().split('\n')
|
||||
|
||||
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
|
@ -111,7 +111,6 @@ Vcs.
|
|||
Vhr.
|
||||
X.Y.
|
||||
Zs.
|
||||
a.
|
||||
a.C.
|
||||
ac.
|
||||
adj.
|
||||
|
@ -126,7 +125,6 @@ ang.
|
|||
arch.
|
||||
at.
|
||||
aug.
|
||||
b.
|
||||
b.a.
|
||||
b.s.
|
||||
b.sc.
|
||||
|
@ -141,7 +139,6 @@ br.
|
|||
bsc.
|
||||
bt.
|
||||
btk.
|
||||
c.
|
||||
ca.
|
||||
cc.
|
||||
cca.
|
||||
|
@ -155,7 +152,6 @@ csc.
|
|||
csüt.
|
||||
cső.
|
||||
ctv.
|
||||
d.
|
||||
dbj.
|
||||
dd.
|
||||
ddr.
|
||||
|
@ -170,7 +166,6 @@ dolg.
|
|||
dr.
|
||||
du.
|
||||
dzs.
|
||||
e.
|
||||
ea.
|
||||
ed.
|
||||
eff.
|
||||
|
@ -186,7 +181,6 @@ etc.
|
|||
ev.
|
||||
ezr.
|
||||
eü.
|
||||
f.
|
||||
f.h.
|
||||
f.é.
|
||||
fam.
|
||||
|
@ -213,7 +207,6 @@ főig.
|
|||
főisk.
|
||||
főtörm.
|
||||
főv.
|
||||
g.
|
||||
gazd.
|
||||
gimn.
|
||||
gk.
|
||||
|
@ -225,7 +218,6 @@ gy.
|
|||
gyak.
|
||||
gyártm.
|
||||
gör.
|
||||
h.
|
||||
hads.
|
||||
hallg.
|
||||
hdm.
|
||||
|
@ -266,7 +258,6 @@ isk.
|
|||
ism.
|
||||
izr.
|
||||
iá.
|
||||
j.
|
||||
jan.
|
||||
jav.
|
||||
jegyz.
|
||||
|
@ -278,7 +269,6 @@ jr.
|
|||
jvb.
|
||||
júl.
|
||||
jún.
|
||||
k.
|
||||
karb.
|
||||
kat.
|
||||
kb.
|
||||
|
@ -313,7 +303,6 @@ közl.
|
|||
közp.
|
||||
közt.
|
||||
kü.
|
||||
l.
|
||||
lat.
|
||||
ld.
|
||||
legs.
|
||||
|
@ -324,7 +313,6 @@ lt.
|
|||
ltd.
|
||||
ltp.
|
||||
luth.
|
||||
m.
|
||||
m.a.
|
||||
m.s.
|
||||
m.sc.
|
||||
|
@ -359,7 +347,6 @@ műh.
|
|||
műsz.
|
||||
műv.
|
||||
művez.
|
||||
n.
|
||||
nagyker.
|
||||
nagys.
|
||||
nat.
|
||||
|
@ -372,7 +359,6 @@ ny.
|
|||
nyilv.
|
||||
nyrt.
|
||||
nyug.
|
||||
o.
|
||||
obj.
|
||||
okl.
|
||||
okt.
|
||||
|
@ -381,7 +367,6 @@ orsz.
|
|||
ort.
|
||||
ov.
|
||||
ovh.
|
||||
p.
|
||||
pf.
|
||||
pg.
|
||||
ph.d
|
||||
|
@ -404,8 +389,6 @@ pság.
|
|||
ptk.
|
||||
pu.
|
||||
pü.
|
||||
q.
|
||||
r.
|
||||
r.k.
|
||||
rac.
|
||||
rad.
|
||||
|
@ -420,7 +403,6 @@ rkt.
|
|||
rt.
|
||||
rtg.
|
||||
röv.
|
||||
s.
|
||||
s.b.
|
||||
s.k.
|
||||
sa.
|
||||
|
@ -450,7 +432,6 @@ szt.
|
|||
szubj.
|
||||
szöv.
|
||||
szül.
|
||||
t.
|
||||
tanm.
|
||||
tb.
|
||||
tbk.
|
||||
|
@ -476,13 +457,11 @@ tvr.
|
|||
ty.
|
||||
törv.
|
||||
tü.
|
||||
u.
|
||||
ua.
|
||||
ui.
|
||||
unit.
|
||||
uo.
|
||||
uv.
|
||||
v.
|
||||
vas.
|
||||
vb.
|
||||
vegy.
|
||||
|
@ -501,9 +480,6 @@ vv.
|
|||
vál.
|
||||
vízv.
|
||||
vö.
|
||||
w.
|
||||
y.
|
||||
z.
|
||||
zrt.
|
||||
zs.
|
||||
Ész.
|
||||
|
@ -520,7 +496,6 @@ zs.
|
|||
évf.
|
||||
í.
|
||||
ó.
|
||||
ö.
|
||||
össz.
|
||||
ötk.
|
||||
özv.
|
||||
|
@ -528,7 +503,6 @@ zs.
|
|||
úm.
|
||||
ún.
|
||||
út.
|
||||
ü.
|
||||
üag.
|
||||
üd.
|
||||
üdv.
|
||||
|
@ -544,6 +518,5 @@ zs.
|
|||
""".strip().split()
|
||||
|
||||
OTHER_EXC = """
|
||||
''
|
||||
-e
|
||||
""".strip().split()
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from os import path
|
||||
|
||||
from ..language import Language
|
||||
from ..attrs import LANG
|
||||
|
||||
|
|
|
@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from .abbreviations import *
|
||||
from .emoticons import *
|
||||
from .punctuation import *
|
||||
from .tag_map import *
|
||||
|
|
43
spacy/language_data/abbreviations.py
Normal file
43
spacy/language_data/abbreviations.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Strings shared by the language data modules as tokenizer exceptions: a few
# special strings followed by one "<letter>." entry per single letter
# (a-z, then the umlauts), in that order.
_SPECIAL_STRINGS = ["'", "\\\")", "<space>", "''", "C++"]
_SINGLE_LETTERS = "abcdefghijklmnopqrstuvwxyzäöü"

ABBREVIATIONS = _SPECIAL_STRINGS + [letter + "." for letter in _SINGLE_LETTERS]
|
||||
|
||||
|
||||
__all__ = [ "ABBREVIATIONS" ]
|
|
@ -13,6 +13,7 @@ EMOTICONS = set("""
|
|||
(-:
|
||||
=)
|
||||
(=
|
||||
")
|
||||
:]
|
||||
:-]
|
||||
[:
|
||||
|
|
|
@ -1,133 +1,115 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
TOKENIZER_PREFIXES = r'''
|
||||
,
|
||||
"
|
||||
(
|
||||
[
|
||||
{
|
||||
*
|
||||
<
|
||||
>
|
||||
$
|
||||
£
|
||||
¡
|
||||
¿
|
||||
„
|
||||
“
|
||||
'
|
||||
``
|
||||
`
|
||||
#
|
||||
‘
|
||||
....
|
||||
...
|
||||
…
|
||||
‚
|
||||
»
|
||||
§
|
||||
US$
|
||||
C$
|
||||
A$
|
||||
a-
|
||||
'''.strip().split('\n')
|
||||
import re
|
||||
|
||||
|
||||
TOKENIZER_SUFFIXES = r'''
|
||||
,
|
||||
\"
|
||||
\)
|
||||
\]
|
||||
\}
|
||||
\*
|
||||
\!
|
||||
\?
|
||||
%
|
||||
\$
|
||||
>
|
||||
:
|
||||
;
|
||||
'
|
||||
”
|
||||
“
|
||||
«
|
||||
_
|
||||
''
|
||||
's
|
||||
'S
|
||||
’s
|
||||
’S
|
||||
’
|
||||
‘
|
||||
°
|
||||
€
|
||||
…
|
||||
\.\.
|
||||
\.\.\.
|
||||
\.\.\.\.
|
||||
(?<=[a-z0-9)\]”"'%\)])\.
|
||||
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
|
||||
\-\-
|
||||
´
|
||||
(?<=[0-9])km²
|
||||
(?<=[0-9])m²
|
||||
(?<=[0-9])cm²
|
||||
(?<=[0-9])mm²
|
||||
(?<=[0-9])km³
|
||||
(?<=[0-9])m³
|
||||
(?<=[0-9])cm³
|
||||
(?<=[0-9])mm³
|
||||
(?<=[0-9])ha
|
||||
(?<=[0-9])km
|
||||
(?<=[0-9])m
|
||||
(?<=[0-9])cm
|
||||
(?<=[0-9])mm
|
||||
(?<=[0-9])µm
|
||||
(?<=[0-9])nm
|
||||
(?<=[0-9])yd
|
||||
(?<=[0-9])in
|
||||
(?<=[0-9])ft
|
||||
(?<=[0-9])kg
|
||||
(?<=[0-9])g
|
||||
(?<=[0-9])mg
|
||||
(?<=[0-9])µg
|
||||
(?<=[0-9])t
|
||||
(?<=[0-9])lb
|
||||
(?<=[0-9])oz
|
||||
(?<=[0-9])m/s
|
||||
(?<=[0-9])km/h
|
||||
(?<=[0-9])mph
|
||||
(?<=[0-9])°C
|
||||
(?<=[0-9])°K
|
||||
(?<=[0-9])°F
|
||||
(?<=[0-9])hPa
|
||||
(?<=[0-9])Pa
|
||||
(?<=[0-9])mbar
|
||||
(?<=[0-9])mb
|
||||
(?<=[0-9])T
|
||||
(?<=[0-9])G
|
||||
(?<=[0-9])M
|
||||
(?<=[0-9])K
|
||||
(?<=[0-9])kb
|
||||
'''.strip().split('\n')
|
||||
_ALPHA_LOWER = """
|
||||
a ä à á â ǎ æ ã å ā ă ą b c ç ć č ĉ ċ c̄ d ð ď e é è ê ë ė ȅ ȩ ẽ ę f g ĝ ğ h i ı
|
||||
î ï í ī ì ȉ ǐ į ĩ j k ķ l ł ļ m n ñ ń ň ņ o ö ó ò ő ô õ œ ø ō ő ǒ ơ p q r ř ŗ s
|
||||
ß ś š ş ŝ t ť u ú û ù ú ū ű ǔ ů ų ư v w ŵ x y ÿ ý ỳ ŷ ỹ z ź ž ż þ
|
||||
"""
|
||||
|
||||
|
||||
TOKENIZER_INFIXES = r'''
|
||||
…
|
||||
\.\.\.+
|
||||
(?<=[a-z])\.(?=[A-Z])
|
||||
(?<=[a-z])\.(?=[A-Z])
|
||||
(?<=[a-zA-Z])-(?=[a-zA-z])
|
||||
(?<=[a-zA-Z])--(?=[a-zA-z])
|
||||
(?<=[0-9])-(?=[0-9])
|
||||
(?<=[A-Za-z]),(?=[A-Za-z])
|
||||
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
|
||||
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
|
||||
(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
|
||||
(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])
|
||||
'''.strip().split('\n')
|
||||
_ALPHA_UPPER = """
|
||||
A Ä À Á Â Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Ė Ȅ Ȩ Ẽ Ę F G Ĝ Ğ H I İ
|
||||
Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S
|
||||
Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ỳ Ŷ Ỹ Z Ź Ž Ż Þ
|
||||
"""
|
||||
|
||||
|
||||
_UNITS = """
|
||||
km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg
|
||||
µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb
|
||||
TB T G M K
|
||||
"""
|
||||
|
||||
|
||||
_CURRENCY = r"""
|
||||
\$ £ € ¥ ฿ US\$ C\$ A\$
|
||||
"""
|
||||
|
||||
|
||||
_QUOTES = r"""
|
||||
' '' " ” “ `` ` ‘ ´ ‚ , „ » «
|
||||
"""
|
||||
|
||||
|
||||
_PUNCT = r"""
|
||||
… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &
|
||||
"""
|
||||
|
||||
|
||||
_HYPHENS = r"""
|
||||
- – — -- ---
|
||||
"""
|
||||
|
||||
|
||||
LIST_ELLIPSES = [
|
||||
r'\.\.+',
|
||||
"…"
|
||||
]
|
||||
|
||||
|
||||
LIST_CURRENCY = list(_CURRENCY.strip().split())
|
||||
LIST_QUOTES = list(_QUOTES.strip().split())
|
||||
LIST_PUNCT = list(_PUNCT.strip().split())
|
||||
LIST_HYPHENS = list(_HYPHENS.strip().split())
|
||||
|
||||
|
||||
# Character-class contents built from the whitespace-separated letter tables.
# split()/join() removes the line breaks inside the multi-line literals as
# well as the spaces, so no "\n" leaks into the [{a}] classes built from these.
ALPHA_LOWER = ''.join(_ALPHA_LOWER.split())
ALPHA_UPPER = ''.join(_ALPHA_UPPER.split())
ALPHA = ALPHA_LOWER + ALPHA_UPPER
|
||||
|
||||
|
||||
# Regex alternations ("a|b|c") built from the whitespace-separated tables.
# Splitting on any whitespace matters for the multi-line _UNITS table: replacing
# only spaces would leave alternatives such as "mg\nµg" that can never match.
QUOTES = '|'.join(_QUOTES.split())
CURRENCY = '|'.join(_CURRENCY.split())
UNITS = '|'.join(_UNITS.split())
HYPHENS = '|'.join(_HYPHENS.split())
|
||||
|
||||
|
||||
|
||||
# Prefixes
|
||||
|
||||
TOKENIZER_PREFIXES = (
|
||||
['§', '%', r'\+'] +
|
||||
LIST_PUNCT +
|
||||
LIST_ELLIPSES +
|
||||
LIST_QUOTES +
|
||||
LIST_CURRENCY
|
||||
)
|
||||
|
||||
|
||||
# Suffixes
|
||||
|
||||
TOKENIZER_SUFFIXES = (
|
||||
LIST_PUNCT +
|
||||
LIST_ELLIPSES +
|
||||
LIST_QUOTES +
|
||||
[
|
||||
r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
|
||||
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
|
||||
r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
|
||||
r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
|
||||
"'s", "'S", "’s", "’S"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# Infixes
|
||||
|
||||
TOKENIZER_INFIXES = (
|
||||
LIST_ELLIPSES +
|
||||
[
|
||||
r'(?<=[0-9])[+\-\*/^](?=[0-9])',
|
||||
r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
||||
|
|
|
@ -20,5 +20,6 @@ TAG_MAP = {
|
|||
"X": {POS: X},
|
||||
"CONJ": {POS: CONJ},
|
||||
"ADJ": {POS: ADJ},
|
||||
"VERB": {POS: VERB}
|
||||
"VERB": {POS: VERB},
|
||||
"PART": {POS: PART}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from os import path
|
||||
|
||||
from ..language import Language
|
||||
from ..attrs import LANG
|
||||
from .language_data import *
|
||||
|
|
|
@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from os import path
|
||||
|
||||
from ..language import Language
|
||||
from ..attrs import LANG
|
||||
|
||||
|
|
|
@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from os import path
|
||||
|
||||
from ..language import Language
|
||||
from ..attrs import LANG
|
||||
from .language_data import *
|
||||
|
|
|
@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
|
||||
|
|
11
spacy/tests/de/conftest.py
Normal file
11
spacy/tests/de/conftest.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
from ...de import German
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def de_tokenizer():
|
||||
return German.Defaults.create_tokenizer()
|
0
spacy/tests/de/tokenizer/__init__.py
Normal file
0
spacy/tests/de/tokenizer/__init__.py
Normal file
27
spacy/tests/de/tokenizer/test_exceptions.py
Normal file
27
spacy/tests/de/tokenizer/test_exceptions.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer exceptions and emoticons are handles correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
|
||||
def test_tokenizer_splits_contractions(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
|
||||
def test_tokenizer_handles_abbr(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
def test_tokenizer_handles_exc_in_text(de_tokenizer):
|
||||
text = "Ich bin z.Zt. im Urlaub."
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 6
|
||||
assert tokens[2].text == "z.Zt."
|
||||
assert tokens[2].lemma_ == "zur Zeit"
|
116
spacy/tests/de/tokenizer/test_prefix_suffix_infix.py
Normal file
116
spacy/tests/de/tokenizer/test_prefix_suffix_infix.py
Normal file
|
@ -0,0 +1,116 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(unter)"])
|
||||
def test_tokenizer_splits_no_special(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["unter'm"])
|
||||
def test_tokenizer_splits_no_punct(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(unter'm"])
|
||||
def test_tokenizer_splits_prefix_punct(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["unter'm)"])
|
||||
def test_tokenizer_splits_suffix_punct(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(unter'm)"])
|
||||
def test_tokenizer_splits_even_wrap(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(unter'm?)"])
|
||||
def test_tokenizer_splits_uneven_wrap(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 5
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
|
||||
def test_tokenizer_splits_prefix_interact(de_tokenizer, text, length):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["z.B.)"])
|
||||
def test_tokenizer_splits_suffix_interact(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(z.B.)"])
|
||||
def test_tokenizer_splits_even_wrap_interact(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(z.B.?)"])
|
||||
def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["blau-rot"])
|
||||
def test_tokenizer_splits_hyphens(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||
def test_tokenizer_splits_numeric_range(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"])
|
||||
def test_tokenizer_splits_period_infix(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"])
|
||||
def test_tokenizer_splits_comma_infix(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[0].text == text.split(",")[0]
|
||||
assert tokens[1].text == ","
|
||||
assert tokens[2].text == text.split(",")[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"])
|
||||
def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
|
||||
tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
|
||||
assert len(tokens) == 12
|
||||
assert tokens[0].text == "Viele"
|
||||
assert tokens[1].text == "Regeln"
|
||||
assert tokens[2].text == "--"
|
||||
assert tokens[3].text == "wie"
|
||||
assert tokens[4].text == "die"
|
||||
assert tokens[5].text == "Bindestrich"
|
||||
assert tokens[6].text == "-"
|
||||
assert tokens[7].text == "Regeln"
|
||||
assert tokens[8].text == "--"
|
||||
assert tokens[9].text == "sind"
|
||||
assert tokens[10].text == "kompliziert"
|
45
spacy/tests/de/tokenizer/test_text.py
Normal file
45
spacy/tests/de/tokenizer/test_text.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
# coding: utf-8
|
||||
"""Test that longer and mixed texts are tokenized correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_tokenizer_handles_long_text(de_tokenizer):
|
||||
text = """Die Verwandlung
|
||||
|
||||
Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in
|
||||
seinem Bett zu einem ungeheueren Ungeziefer verwandelt.
|
||||
|
||||
Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig
|
||||
hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten
|
||||
Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit,
|
||||
kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen
|
||||
Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
|
||||
|
||||
»Was ist mit mir geschehen?«, dachte er."""
|
||||
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 109
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
|
||||
("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
|
||||
("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
|
||||
("Kraftfahrzeug-Haftpflichtversicherung", 3),
|
||||
("Vakuum-Mittelfrequenz-Induktionsofen", 5)
|
||||
])
|
||||
def test_tokenizer_handles_long_words(de_tokenizer, text, length):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
    ("»Was ist mit mir geschehen?«, dachte er.", 12),
    ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15),
])
def test_tokenizer_handles_examples(de_tokenizer, text, length):
    """Check the token count for example sentences containing quotation marks."""
    doc = de_tokenizer(text)
    token_count = len(doc)
    assert token_count == length
|
0
spacy/tests/en/__init__.py
Normal file
0
spacy/tests/en/__init__.py
Normal file
11
spacy/tests/en/conftest.py
Normal file
11
spacy/tests/en/conftest.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
from ...en import English
|
||||
|
||||
|
||||
@pytest.fixture
def en_tokenizer():
    """Provide the tokenizer returned by ``English.Defaults.create_tokenizer()``."""
    tokenizer = English.Defaults.create_tokenizer()
    return tokenizer
|
0
spacy/tests/en/tokenizer/__init__.py
Normal file
0
spacy/tests/en/tokenizer/__init__.py
Normal file
87
spacy/tests/en/tokenizer/test_contractions.py
Normal file
87
spacy/tests/en/tokenizer/test_contractions.py
Normal file
|
@ -0,0 +1,87 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokens are created correctly for contractions."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_tokenizer_handles_basic_contraction(en_tokenizer):
    """Check that "don't" is split, both on its own and before punctuation."""
    # Contraction followed by another word.
    doc = en_tokenizer("don't giggle")
    assert len(doc) == 3
    assert doc[1].text == "n't"

    # Contraction at the end of a sentence, directly before "!".
    doc = en_tokenizer("i said don't!")
    assert len(doc) == 5
    assert doc[4].text == "!"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    """Check that a contraction with one attached punctuation mark gives three tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    """Check that a possessive is split into the bare name and "'s"."""
    doc = en_tokenizer(text_poss)
    assert len(doc) == 2
    name_token, suffix_token = doc[0], doc[1]
    assert name_token.text == text
    assert suffix_token.text == "'s"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
    """Check that a trailing apostrophe is split off as its own token."""
    # Everything before the first apostrophe is the expected first token.
    stem, _, _ = text.partition("'")
    doc = en_tokenizer(text)
    assert len(doc) == 2
    assert doc[0].text == stem
    assert doc[1].text == "'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def test_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
    """Check that apostrophe forms such as "'em" stay a single token.

    The function was previously named ``text_tokenizer_...``; without the
    ``test_`` prefix pytest's default discovery never collected it, so
    these cases were silently not run.
    """
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
    """Check that "'ll" is split off and carries the lemma "will"."""
    # Everything before the first apostrophe is the expected first token.
    stem, _, _ = text.partition("'")
    doc = en_tokenizer(text)
    assert len(doc) == 2
    assert doc[0].text == stem
    contraction = doc[1]
    assert contraction.text == "'ll"
    assert contraction.lemma_ == "will"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
    """Check that lower-case and title-case contractions are split the same way."""
    lower_doc = en_tokenizer(text_lower)
    title_doc = en_tokenizer(text_title)
    # The first tokens may differ only in case; the second tokens must be identical.
    assert title_doc[0].text == lower_doc[0].text.title()
    assert lower_doc[0].text == title_doc[0].text.lower()
    assert lower_doc[1].text == title_doc[1].text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
    """Check that pronoun + contraction splits in two and keeps the pronoun's case."""
    doc = en_tokenizer(pron + contraction)
    pron_token, contraction_token = doc[0], doc[1]
    assert pron_token.text == pron
    assert contraction_token.text == contraction
|
||||
|
||||
|
||||
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
    """Check that words resembling apostrophe-less contractions stay one token."""
    doc = en_tokenizer(exc)
    token_count = len(doc)
    assert token_count == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    """Check that attached punctuation adds exactly one token to a contraction."""
    plain_doc = en_tokenizer(wo_punct)
    assert len(plain_doc) == 2
    punct_doc = en_tokenizer(w_punct)
    assert len(punct_doc) == 3
|
20
spacy/tests/en/tokenizer/test_exceptions.py
Normal file
20
spacy/tests/en/tokenizer/test_exceptions.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer exceptions are handled correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_tokenizer_handles_abbr(en_tokenizer, text):
    """Check that a period-terminated abbreviation stays a single token."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 1
|
||||
|
||||
|
||||
def test_tokenizer_handles_exc_in_text(en_tokenizer):
    """Check that "i.e." stays one token inside a full sentence."""
    doc = en_tokenizer("It's mediocre i.e. bad.")
    assert len(doc) == 6
    abbreviation = doc[3]
    assert abbreviation.text == "i.e."
|
|
@ -1,12 +1,14 @@
|
|||
# coding: utf-8
|
||||
"""Test that token.idx correctly computes index into the original string."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_simple_punct(en_tokenizer):
|
||||
text = 'to walk, do foo'
|
||||
text = "to walk, do foo"
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].idx == 0
|
||||
assert tokens[1].idx == 3
|
||||
|
@ -16,7 +18,7 @@ def test_simple_punct(en_tokenizer):
|
|||
|
||||
|
||||
def test_complex_punct(en_tokenizer):
|
||||
text = 'Tom (D., Ill.)!'
|
||||
text = "Tom (D., Ill.)!"
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].idx == 0
|
||||
assert len(tokens[0]) == 3
|
136
spacy/tests/en/tokenizer/test_prefix_suffix_infix.py
Normal file
136
spacy/tests/en/tokenizer/test_prefix_suffix_infix.py
Normal file
|
@ -0,0 +1,136 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(can)"])
def test_tokenizer_splits_no_special(en_tokenizer, text):
    """Check that a bracketed plain word gives three tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["can't"])
def test_tokenizer_splits_no_punct(en_tokenizer, text):
    """Check that a bare contraction gives two tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(can't"])
def test_tokenizer_splits_prefix_punct(en_tokenizer, text):
    """Check that a contraction with a leading bracket gives three tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["can't)"])
def test_tokenizer_splits_suffix_punct(en_tokenizer, text):
    """Check that a contraction with a trailing bracket gives three tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(can't)"])
def test_tokenizer_splits_even_wrap(en_tokenizer, text):
    """Check that a contraction wrapped in brackets gives four tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(can't?)"])
def test_tokenizer_splits_uneven_wrap(en_tokenizer, text):
    """Check that one prefix and two suffix marks around a contraction give five tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 5
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
def test_tokenizer_splits_prefix_interact(en_tokenizer, text, length):
    """Check token counts where periods and a leading bracket interact."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["U.S.)"])
def test_tokenizer_splits_suffix_interact(en_tokenizer, text):
    """Check that "U.S." followed by a closing bracket gives two tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(U.S.)"])
def test_tokenizer_splits_even_wrap_interact(en_tokenizer, text):
    """Check that "U.S." wrapped in brackets gives three tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(U.S.?)"])
def test_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text):
    """Check that "U.S." with one prefix and two suffix marks gives four tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["best-known"])
def test_tokenizer_splits_hyphens(en_tokenizer, text):
    """Check that a hyphenated word gives three tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_tokenizer_splits_numeric_range(en_tokenizer, text):
    """Check that a hyphen-separated numeric range gives three tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["best.Known", "Hello.World"])
def test_tokenizer_splits_period_infix(en_tokenizer, text):
    """Check that a period between a word and a capitalised word gives three tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Hello,world", "one,two"])
def test_tokenizer_splits_comma_infix(en_tokenizer, text):
    """Check that a comma between two words is split into its own token."""
    parts = text.split(",")
    doc = en_tokenizer(text)
    assert len(doc) == 3
    assert doc[0].text == parts[0]
    assert doc[1].text == ","
    assert doc[2].text == parts[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["best...Known", "best...known"])
def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
    """Check that an ellipsis between two words gives three tokens."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 3
|
||||
|
||||
|
||||
def test_tokenizer_splits_double_hyphen_infix(en_tokenizer):
    """Check that "--" is split as one token and "-" separately inside a sentence."""
    doc = en_tokenizer("No decent--let alone well-bred--people.")
    expected_texts = [
        "No", "decent", "--", "let", "alone",
        "well", "-", "bred", "--", "people",
    ]
    # Compare position by position, in the same order as the expected list.
    for index, expected in enumerate(expected_texts):
        assert doc[index].text == expected
|
||||
|
||||
|
||||
@pytest.mark.xfail
def test_tokenizer_splits_period_abbr(en_tokenizer):
    """Expected failure: a period directly followed by "Mr." should be split off."""
    doc = en_tokenizer("Today is Tuesday.Mr.")
    assert len(doc) == 5
    expected_texts = ["Today", "is", "Tuesday", ".", "Mr."]
    for index, expected in enumerate(expected_texts):
        assert doc[index].text == expected
|
||||
|
||||
|
||||
@pytest.mark.xfail
def test_tokenizer_splits_em_dash_infix(en_tokenizer):
    """Expected failure: an em dash between sentences should be its own token."""
    # Re Issue #225
    text = ("""Will this road take me to Puddleton?\u2014No, """
            """you'll have to walk there.\u2014Ariel.""")
    doc = en_tokenizer(text)
    assert doc[6].text == "Puddleton"
    assert doc[7].text == "?"
    assert doc[8].text == "\u2014"
|
132
spacy/tests/en/tokenizer/test_punct.py
Normal file
132
spacy/tests/en/tokenizer/test_punct.py
Normal file
|
@ -0,0 +1,132 @@
|
|||
# coding: utf-8
|
||||
"""Test that open, closed and paired punctuation is split off correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
from ....util import compile_prefix_regex
|
||||
from ....language_data import TOKENIZER_PREFIXES
|
||||
|
||||
|
||||
|
||||
# Bound ``search`` of the object compiled from TOKENIZER_PREFIXES; used by
# test_tokenizer_splits_pre_punct_regex.
_en_prefix_regex = compile_prefix_regex(TOKENIZER_PREFIXES)
en_search_prefixes = _en_prefix_regex.search
|
||||
|
||||
# Opening and closing marks used to parametrize the punctuation tests.
# PUNCT_PAIRED matches them up position by position.
PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = list(zip(PUNCT_OPEN, PUNCT_CLOSE))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(", "((", "<"])
def test_tokenizer_handles_only_punct(en_tokenizer, text):
    """Check that punctuation-only input gives one token per character."""
    doc = en_tokenizer(text)
    expected_count = len(text)
    assert len(doc) == expected_count
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
    """Check that a single opening mark is split off the front of a word."""
    doc = en_tokenizer(punct + text)
    assert len(doc) == 2
    punct_token, word_token = doc[0], doc[1]
    assert punct_token.text == punct
    assert word_token.text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
    """Check that a single closing mark is split off the end of a word."""
    doc = en_tokenizer(text + punct)
    assert len(doc) == 2
    word_token, punct_token = doc[0], doc[1]
    assert word_token.text == text
    assert punct_token.text == punct
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
    """Check that two different opening marks before a word are split separately."""
    doc = en_tokenizer(punct + punct_add + text)
    assert len(doc) == 3
    expected_texts = [punct, punct_add, text]
    for index, expected in enumerate(expected_texts):
        assert doc[index].text == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
    """Check that two different closing marks after a word are split separately."""
    doc = en_tokenizer(text + punct + punct_add)
    assert len(doc) == 3
    expected_texts = [text, punct, punct_add]
    for index, expected in enumerate(expected_texts):
        assert doc[index].text == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
    """Check that three repeated opening marks before a word are each split off."""
    doc = en_tokenizer(punct + punct + punct + text)
    assert len(doc) == 4
    # Only the first and the last token are inspected.
    first_token, last_token = doc[0], doc[3]
    assert first_token.text == punct
    assert last_token.text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
    """Check that three repeated closing marks after a word are each split off."""
    doc = en_tokenizer(text + punct + punct + punct)
    assert len(doc) == 4
    # Only the word and the first closing mark are inspected.
    word_token, first_punct = doc[0], doc[1]
    assert word_token.text == text
    assert first_punct.text == punct
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["'The"])
def test_tokenizer_splits_open_appostrophe(en_tokenizer, text):
    """Check that a leading apostrophe is split off the front of a word."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    leading = doc[0]
    assert leading.text == "'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Hello''"])
def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
    """Check that a trailing '' is one token, attached to a word or standing alone."""
    word_doc = en_tokenizer(text)
    assert len(word_doc) == 2
    quote_doc = en_tokenizer("''")
    assert len(quote_doc) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text):
    """Check that a word wrapped in a matching pair is split into three tokens."""
    doc = en_tokenizer(punct_open + text + punct_close)
    assert len(doc) == 3
    expected_texts = [punct_open, text, punct_close]
    for index, expected in enumerate(expected_texts):
        assert doc[index].text == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")])
@pytest.mark.parametrize('text', ["Hello"])
def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text):
    """Check that a word wrapped in two nested pairs is split into five tokens."""
    wrapped = punct_open_add + punct_open + text + punct_close + punct_close_add
    doc = en_tokenizer(wrapped)
    assert len(doc) == 5
    expected_texts = [punct_open_add, punct_open, text, punct_close, punct_close_add]
    for index, expected in enumerate(expected_texts):
        assert doc[index].text == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
def test_tokenizer_splits_pre_punct_regex(text, punct):
    """Check that the prefix search matches the leading punctuation mark of *text*."""
    match = en_search_prefixes(text)
    # Presumably this is a regex ``search`` that returns None on no match (the
    # helper is defined outside this file). Assert it first, so a regression
    # shows up as an assertion failure instead of an AttributeError on None.
    assert match is not None
    assert match.group() == punct
|
||||
|
||||
|
||||
def test_tokenizer_splits_bracket_period(en_tokenizer):
    """Check that the final period after a closing bracket is its own token."""
    doc = en_tokenizer("(And a 6a.m. run through Washington Park).")
    last_index = len(doc) - 1
    assert doc[last_index].text == "."
|
36
spacy/tests/en/tokenizer/test_text.py
Normal file
36
spacy/tests/en/tokenizer/test_text.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# coding: utf-8
|
||||
"""Test that longer and mixed texts are tokenized correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_tokenizer_handles_long_text(en_tokenizer):
    """Tokenize a multi-paragraph English news excerpt and check the total token count."""
    text = """Tributes pour in for late British Labour Party leader

Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.

In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

"Mr. Smith, throughout his distinguished"""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 76
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
    ("The U.S. Army likes Shock and Awe.", 8),
    ("U.N. regulations are not a part of their concern.", 10),
    ("“Isn't it?”", 6),
    ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
    ("""'Me too!', Mr. P. Delaware cried. """, 11),
    ("They ran about 10km.", 6),
    # Disabled case, kept as in the original (reason not recorded here):
    # ("But then the 6,000-year ice age came...", 10)
])
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
    """Check the token count for sentences mixing abbreviations, quotes and units."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == length
|
11
spacy/tests/hu/conftest.py
Normal file
11
spacy/tests/hu/conftest.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
from ...hu import Hungarian
|
||||
|
||||
|
||||
@pytest.fixture
def hu_tokenizer():
    """Provide the tokenizer returned by ``Hungarian.Defaults.create_tokenizer()``."""
    tokenizer = Hungarian.Defaults.create_tokenizer()
    return tokenizer
|
|
@ -2,25 +2,27 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.hu import Hungarian
|
||||
|
||||
_DEFAULT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
||||
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
||||
('A .hu.', ['A', '.hu', '.']),
|
||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||
('A pl.', ['A', 'pl.']),
|
||||
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
||||
('Egy..ket.', ['Egy', '..', 'ket', '.']),
|
||||
('Valami... van.', ['Valami', '...', 'van', '.']),
|
||||
('Valami ...van...', ['Valami', '...', 'van', '...']),
|
||||
('Valami...', ['Valami', '...']),
|
||||
('Valami ...', ['Valami', '...']),
|
||||
('Valami ... más.', ['Valami', '...', 'más', '.'])]
|
||||
|
||||
_HYPHEN_TESTS = [
|
||||
DEFAULT_TESTS = [
|
||||
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
||||
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
||||
('A .hu.', ['A', '.hu', '.']),
|
||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||
('A pl.', ['A', 'pl.']),
|
||||
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
||||
('Egy..ket.', ['Egy', '..', 'ket', '.']),
|
||||
('Valami... van.', ['Valami', '...', 'van', '.']),
|
||||
('Valami ...van...', ['Valami', '...', 'van', '...']),
|
||||
('Valami...', ['Valami', '...']),
|
||||
('Valami ...', ['Valami', '...']),
|
||||
('Valami ... más.', ['Valami', '...', 'más', '.'])
|
||||
]
|
||||
|
||||
HYPHEN_TESTS = [
|
||||
('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']),
|
||||
('Egy -nak.', ['Egy', '-nak', '.']),
|
||||
('Egy bel-.', ['Egy', 'bel-', '.']),
|
||||
|
@ -39,195 +41,194 @@ _HYPHEN_TESTS = [
|
|||
('A 7-es.', ['A', '7-es', '.']),
|
||||
('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
|
||||
('A %-sal.', ['A', '%-sal', '.']),
|
||||
('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])]
|
||||
('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])
|
||||
]
|
||||
|
||||
_NUMBER_TESTS = [('A 2b van.', ['A', '2b', 'van', '.']),
|
||||
('A 2b-ben van.', ['A', '2b-ben', 'van', '.']),
|
||||
('A 2b.', ['A', '2b', '.']),
|
||||
('A 2b-ben.', ['A', '2b-ben', '.']),
|
||||
('A 3.b van.', ['A', '3.b', 'van', '.']),
|
||||
('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']),
|
||||
('A 3.b.', ['A', '3.b', '.']),
|
||||
('A 3.b-ben.', ['A', '3.b-ben', '.']),
|
||||
('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']),
|
||||
('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']),
|
||||
('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']),
|
||||
('A 1:35 van.', ['A', '1:35', 'van', '.']),
|
||||
('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']),
|
||||
('A 1:35-ben.', ['A', '1:35-ben', '.']),
|
||||
('A 1.35 van.', ['A', '1.35', 'van', '.']),
|
||||
('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']),
|
||||
('A 1.35-ben.', ['A', '1.35-ben', '.']),
|
||||
('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']),
|
||||
('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']),
|
||||
('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']),
|
||||
('A 10--12 van.', ['A', '10--12', 'van', '.']),
|
||||
('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']),
|
||||
('A 10--12-ben.', ['A', '10--12-ben', '.']),
|
||||
('A 10‐12 van.', ['A', '10‐12', 'van', '.']),
|
||||
('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']),
|
||||
('A 10‐12-ben.', ['A', '10‐12-ben', '.']),
|
||||
('A 10‑12 van.', ['A', '10‑12', 'van', '.']),
|
||||
('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']),
|
||||
('A 10‑12-ben.', ['A', '10‑12-ben', '.']),
|
||||
('A 10‒12 van.', ['A', '10‒12', 'van', '.']),
|
||||
('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']),
|
||||
('A 10‒12-ben.', ['A', '10‒12-ben', '.']),
|
||||
('A 10–12 van.', ['A', '10–12', 'van', '.']),
|
||||
('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']),
|
||||
('A 10–12-ben.', ['A', '10–12-ben', '.']),
|
||||
('A 10—12 van.', ['A', '10—12', 'van', '.']),
|
||||
('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']),
|
||||
('A 10—12-ben.', ['A', '10—12-ben', '.']),
|
||||
('A 10―12 van.', ['A', '10―12', 'van', '.']),
|
||||
('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']),
|
||||
('A 10―12-ben.', ['A', '10―12-ben', '.']),
|
||||
('A -23,12 van.', ['A', '-23,12', 'van', '.']),
|
||||
('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']),
|
||||
('A -23,12-ben.', ['A', '-23,12-ben', '.']),
|
||||
('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A C++ van.', ['A', 'C++', 'van', '.']),
|
||||
('A C++-ben van.', ['A', 'C++-ben', 'van', '.']),
|
||||
('A C++.', ['A', 'C++', '.']),
|
||||
('A C++-ben.', ['A', 'C++-ben', '.']),
|
||||
('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']),
|
||||
('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']),
|
||||
('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']),
|
||||
('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']),
|
||||
('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']),
|
||||
('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']),
|
||||
('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']),
|
||||
('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']),
|
||||
('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']),
|
||||
('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']),
|
||||
('A IV. 12.', ['A', 'IV.', '12.']),
|
||||
('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']),
|
||||
('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']),
|
||||
('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']),
|
||||
('A 2003.01.06.', ['A', '2003.01.06.']),
|
||||
('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']),
|
||||
('A IV.12. van.', ['A', 'IV.12.', 'van', '.']),
|
||||
('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']),
|
||||
('A IV.12.', ['A', 'IV.12.']),
|
||||
('A IV.12-ben.', ['A', 'IV.12-ben', '.']),
|
||||
('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']),
|
||||
('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']),
|
||||
('A 1.1.2.', ['A', '1.1.2.']),
|
||||
('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']),
|
||||
('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']),
|
||||
('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']),
|
||||
('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']),
|
||||
('A 3,14 van.', ['A', '3,14', 'van', '.']),
|
||||
('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']),
|
||||
('A 3,14-ben.', ['A', '3,14-ben', '.']),
|
||||
('A 3.14 van.', ['A', '3.14', 'van', '.']),
|
||||
('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']),
|
||||
('A 3.14-ben.', ['A', '3.14-ben', '.']),
|
||||
('A 15. van.', ['A', '15.', 'van', '.']),
|
||||
('A 15-ben van.', ['A', '15-ben', 'van', '.']),
|
||||
('A 15-ben.', ['A', '15-ben', '.']),
|
||||
('A 15.-ben van.', ['A', '15.-ben', 'van', '.']),
|
||||
('A 15.-ben.', ['A', '15.-ben', '.']),
|
||||
('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']),
|
||||
('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']),
|
||||
('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']),
|
||||
('A -0,99% van.', ['A', '-0,99%', 'van', '.']),
|
||||
('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']),
|
||||
('A -0,99%.', ['A', '-0,99%', '.']),
|
||||
('A -0,99%-ben.', ['A', '-0,99%-ben', '.']),
|
||||
('A 10--20% van.', ['A', '10--20%', 'van', '.']),
|
||||
('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']),
|
||||
('A 10--20%.', ['A', '10--20%', '.']),
|
||||
('A 10--20%-ben.', ['A', '10--20%-ben', '.']),
|
||||
('A 99§ van.', ['A', '99§', 'van', '.']),
|
||||
('A 99§-ben van.', ['A', '99§-ben', 'van', '.']),
|
||||
('A 99§-ben.', ['A', '99§-ben', '.']),
|
||||
('A 10--20§ van.', ['A', '10--20§', 'van', '.']),
|
||||
('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']),
|
||||
('A 10--20§-ben.', ['A', '10--20§-ben', '.']),
|
||||
('A 99° van.', ['A', '99°', 'van', '.']),
|
||||
('A 99°-ben van.', ['A', '99°-ben', 'van', '.']),
|
||||
('A 99°-ben.', ['A', '99°-ben', '.']),
|
||||
('A 10--20° van.', ['A', '10--20°', 'van', '.']),
|
||||
('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']),
|
||||
('A 10--20°-ben.', ['A', '10--20°-ben', '.']),
|
||||
('A °C van.', ['A', '°C', 'van', '.']),
|
||||
('A °C-ben van.', ['A', '°C-ben', 'van', '.']),
|
||||
('A °C.', ['A', '°C', '.']),
|
||||
('A °C-ben.', ['A', '°C-ben', '.']),
|
||||
('A 100°C van.', ['A', '100°C', 'van', '.']),
|
||||
('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']),
|
||||
('A 100°C.', ['A', '100°C', '.']),
|
||||
('A 100°C-ben.', ['A', '100°C-ben', '.']),
|
||||
('A 800x600 van.', ['A', '800x600', 'van', '.']),
|
||||
('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']),
|
||||
('A 800x600-ben.', ['A', '800x600-ben', '.']),
|
||||
('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']),
|
||||
('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']),
|
||||
('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']),
|
||||
('A 5/J van.', ['A', '5/J', 'van', '.']),
|
||||
('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']),
|
||||
('A 5/J-ben.', ['A', '5/J-ben', '.']),
|
||||
('A 5/J. van.', ['A', '5/J.', 'van', '.']),
|
||||
('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']),
|
||||
('A 5/J.-ben.', ['A', '5/J.-ben', '.']),
|
||||
('A III/1 van.', ['A', 'III/1', 'van', '.']),
|
||||
('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']),
|
||||
('A III/1-ben.', ['A', 'III/1-ben', '.']),
|
||||
('A III/1. van.', ['A', 'III/1.', 'van', '.']),
|
||||
('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']),
|
||||
('A III/1.-ben.', ['A', 'III/1.-ben', '.']),
|
||||
('A III/c van.', ['A', 'III/c', 'van', '.']),
|
||||
('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']),
|
||||
('A III/c.', ['A', 'III/c', '.']),
|
||||
('A III/c-ben.', ['A', 'III/c-ben', '.']),
|
||||
('A TU–154 van.', ['A', 'TU–154', 'van', '.']),
|
||||
('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']),
|
||||
('A TU–154-ben.', ['A', 'TU–154-ben', '.'])]
|
||||
NUMBER_TESTS = [
|
||||
('A 2b van.', ['A', '2b', 'van', '.']),
|
||||
('A 2b-ben van.', ['A', '2b-ben', 'van', '.']),
|
||||
('A 2b.', ['A', '2b', '.']),
|
||||
('A 2b-ben.', ['A', '2b-ben', '.']),
|
||||
('A 3.b van.', ['A', '3.b', 'van', '.']),
|
||||
('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']),
|
||||
('A 3.b.', ['A', '3.b', '.']),
|
||||
('A 3.b-ben.', ['A', '3.b-ben', '.']),
|
||||
('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']),
|
||||
('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']),
|
||||
('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']),
|
||||
('A 1:35 van.', ['A', '1:35', 'van', '.']),
|
||||
('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']),
|
||||
('A 1:35-ben.', ['A', '1:35-ben', '.']),
|
||||
('A 1.35 van.', ['A', '1.35', 'van', '.']),
|
||||
('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']),
|
||||
('A 1.35-ben.', ['A', '1.35-ben', '.']),
|
||||
('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']),
|
||||
('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']),
|
||||
('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']),
|
||||
('A 10--12 van.', ['A', '10--12', 'van', '.']),
|
||||
('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']),
|
||||
('A 10--12-ben.', ['A', '10--12-ben', '.']),
|
||||
('A 10‐12 van.', ['A', '10‐12', 'van', '.']),
|
||||
('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']),
|
||||
('A 10‐12-ben.', ['A', '10‐12-ben', '.']),
|
||||
('A 10‑12 van.', ['A', '10‑12', 'van', '.']),
|
||||
('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']),
|
||||
('A 10‑12-ben.', ['A', '10‑12-ben', '.']),
|
||||
('A 10‒12 van.', ['A', '10‒12', 'van', '.']),
|
||||
('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']),
|
||||
('A 10‒12-ben.', ['A', '10‒12-ben', '.']),
|
||||
('A 10–12 van.', ['A', '10–12', 'van', '.']),
|
||||
('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']),
|
||||
('A 10–12-ben.', ['A', '10–12-ben', '.']),
|
||||
('A 10—12 van.', ['A', '10—12', 'van', '.']),
|
||||
('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']),
|
||||
('A 10—12-ben.', ['A', '10—12-ben', '.']),
|
||||
('A 10―12 van.', ['A', '10―12', 'van', '.']),
|
||||
('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']),
|
||||
('A 10―12-ben.', ['A', '10―12-ben', '.']),
|
||||
('A -23,12 van.', ['A', '-23,12', 'van', '.']),
|
||||
('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']),
|
||||
('A -23,12-ben.', ['A', '-23,12-ben', '.']),
|
||||
('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A C++ van.', ['A', 'C++', 'van', '.']),
|
||||
('A C++-ben van.', ['A', 'C++-ben', 'van', '.']),
|
||||
('A C++.', ['A', 'C++', '.']),
|
||||
('A C++-ben.', ['A', 'C++-ben', '.']),
|
||||
('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']),
|
||||
('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']),
|
||||
('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']),
|
||||
('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']),
|
||||
('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']),
|
||||
('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']),
|
||||
('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']),
|
||||
('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']),
|
||||
('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']),
|
||||
('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']),
|
||||
('A IV. 12.', ['A', 'IV.', '12.']),
|
||||
('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']),
|
||||
('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']),
|
||||
('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']),
|
||||
('A 2003.01.06.', ['A', '2003.01.06.']),
|
||||
('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']),
|
||||
('A IV.12. van.', ['A', 'IV.12.', 'van', '.']),
|
||||
('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']),
|
||||
('A IV.12.', ['A', 'IV.12.']),
|
||||
('A IV.12-ben.', ['A', 'IV.12-ben', '.']),
|
||||
('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']),
|
||||
('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']),
|
||||
('A 1.1.2.', ['A', '1.1.2.']),
|
||||
('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']),
|
||||
('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']),
|
||||
('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']),
|
||||
('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']),
|
||||
('A 3,14 van.', ['A', '3,14', 'van', '.']),
|
||||
('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']),
|
||||
('A 3,14-ben.', ['A', '3,14-ben', '.']),
|
||||
('A 3.14 van.', ['A', '3.14', 'van', '.']),
|
||||
('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']),
|
||||
('A 3.14-ben.', ['A', '3.14-ben', '.']),
|
||||
('A 15. van.', ['A', '15.', 'van', '.']),
|
||||
('A 15-ben van.', ['A', '15-ben', 'van', '.']),
|
||||
('A 15-ben.', ['A', '15-ben', '.']),
|
||||
('A 15.-ben van.', ['A', '15.-ben', 'van', '.']),
|
||||
('A 15.-ben.', ['A', '15.-ben', '.']),
|
||||
('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']),
|
||||
('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']),
|
||||
('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']),
|
||||
('A -0,99% van.', ['A', '-0,99%', 'van', '.']),
|
||||
('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']),
|
||||
('A -0,99%.', ['A', '-0,99%', '.']),
|
||||
('A -0,99%-ben.', ['A', '-0,99%-ben', '.']),
|
||||
('A 10--20% van.', ['A', '10--20%', 'van', '.']),
|
||||
('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']),
|
||||
('A 10--20%.', ['A', '10--20%', '.']),
|
||||
('A 10--20%-ben.', ['A', '10--20%-ben', '.']),
|
||||
('A 99§ van.', ['A', '99§', 'van', '.']),
|
||||
('A 99§-ben van.', ['A', '99§-ben', 'van', '.']),
|
||||
('A 99§-ben.', ['A', '99§-ben', '.']),
|
||||
('A 10--20§ van.', ['A', '10--20§', 'van', '.']),
|
||||
('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']),
|
||||
('A 10--20§-ben.', ['A', '10--20§-ben', '.']),
|
||||
('A 99° van.', ['A', '99°', 'van', '.']),
|
||||
('A 99°-ben van.', ['A', '99°-ben', 'van', '.']),
|
||||
('A 99°-ben.', ['A', '99°-ben', '.']),
|
||||
('A 10--20° van.', ['A', '10--20°', 'van', '.']),
|
||||
('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']),
|
||||
('A 10--20°-ben.', ['A', '10--20°-ben', '.']),
|
||||
('A °C van.', ['A', '°C', 'van', '.']),
|
||||
('A °C-ben van.', ['A', '°C-ben', 'van', '.']),
|
||||
('A °C.', ['A', '°C', '.']),
|
||||
('A °C-ben.', ['A', '°C-ben', '.']),
|
||||
('A 100°C van.', ['A', '100°C', 'van', '.']),
|
||||
('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']),
|
||||
('A 100°C.', ['A', '100°C', '.']),
|
||||
('A 100°C-ben.', ['A', '100°C-ben', '.']),
|
||||
('A 800x600 van.', ['A', '800x600', 'van', '.']),
|
||||
('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']),
|
||||
('A 800x600-ben.', ['A', '800x600-ben', '.']),
|
||||
('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']),
|
||||
('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']),
|
||||
('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']),
|
||||
('A 5/J van.', ['A', '5/J', 'van', '.']),
|
||||
('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']),
|
||||
('A 5/J-ben.', ['A', '5/J-ben', '.']),
|
||||
('A 5/J. van.', ['A', '5/J.', 'van', '.']),
|
||||
('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']),
|
||||
('A 5/J.-ben.', ['A', '5/J.-ben', '.']),
|
||||
('A III/1 van.', ['A', 'III/1', 'van', '.']),
|
||||
('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']),
|
||||
('A III/1-ben.', ['A', 'III/1-ben', '.']),
|
||||
('A III/1. van.', ['A', 'III/1.', 'van', '.']),
|
||||
('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']),
|
||||
('A III/1.-ben.', ['A', 'III/1.-ben', '.']),
|
||||
('A III/c van.', ['A', 'III/c', 'van', '.']),
|
||||
('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']),
|
||||
('A III/c.', ['A', 'III/c', '.']),
|
||||
('A III/c-ben.', ['A', 'III/c-ben', '.']),
|
||||
('A TU–154 van.', ['A', 'TU–154', 'van', '.']),
|
||||
('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']),
|
||||
('A TU–154-ben.', ['A', 'TU–154-ben', '.'])
|
||||
]
|
||||
|
||||
_QUOTE_TESTS = [('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
|
||||
('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
|
||||
('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']),
|
||||
('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']),
|
||||
("A don't van.", ['A', "don't", 'van', '.'])]
|
||||
QUOTE_TESTS = [
|
||||
('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
|
||||
('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
|
||||
('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']),
|
||||
('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']),
|
||||
("A don't van.", ['A', "don't", 'van', '.'])
|
||||
]
|
||||
|
||||
_DOT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
||||
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
||||
('A .hu.', ['A', '.hu', '.']),
|
||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||
('A pl.', ['A', 'pl.']),
|
||||
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
||||
('Egy..ket.', ['Egy', '..', 'ket', '.']),
|
||||
('Valami... van.', ['Valami', '...', 'van', '.']),
|
||||
('Valami ...van...', ['Valami', '...', 'van', '...']),
|
||||
('Valami...', ['Valami', '...']),
|
||||
('Valami ...', ['Valami', '...']),
|
||||
('Valami ... más.', ['Valami', '...', 'más', '.'])]
|
||||
DOT_TESTS = [
|
||||
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
||||
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
||||
('A .hu.', ['A', '.hu', '.']),
|
||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||
('A pl.', ['A', 'pl.']),
|
||||
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
||||
('Egy..ket.', ['Egy', '..', 'ket', '.']),
|
||||
('Valami... van.', ['Valami', '...', 'van', '.']),
|
||||
('Valami ...van...', ['Valami', '...', 'van', '...']),
|
||||
('Valami...', ['Valami', '...']),
|
||||
('Valami ...', ['Valami', '...']),
|
||||
('Valami ... más.', ['Valami', '...', 'más', '.'])
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def HU():
    """Build the Hungarian language object once per test session."""
    hungarian = Hungarian()
    return hungarian
|
||||
TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS # + NUMBER_TESTS + HYPHEN_TESTS
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def hu_tokenizer(HU):
    """Expose the tokenizer attribute of the ``HU`` fixture object."""
    hungarian_tokenizer = HU.tokenizer
    return hungarian_tokenizer
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("input", "expected_tokens"),
|
||||
_DEFAULT_TESTS + _HYPHEN_TESTS + _NUMBER_TESTS + _DOT_TESTS + _QUOTE_TESTS)
|
||||
def test_testcases(hu_tokenizer, input, expected_tokens):
|
||||
tokens = hu_tokenizer(input)
|
||||
token_list = [token.orth_ for token in tokens if not token.is_space]
|
||||
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
|
||||
def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
|
||||
tokens = hu_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
assert expected_tokens == token_list
|
||||
|
|
16
spacy/tests/regression/test_issue351.py
Normal file
16
spacy/tests/regression/test_issue351.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
from __future__ import unicode_literals
|
||||
from ...en import English
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def en_tokenizer():
    """Create an English tokenizer from the language defaults."""
    defaults = English.Defaults
    return defaults.create_tokenizer()
|
||||
|
||||
|
||||
def test_issue351(en_tokenizer):
    """Check character offsets when the text starts with whitespace.

    The input begins with three spaces; the assertions require the first
    token to be three characters long and the second token to start at
    character offset 3.
    """
    # Three leading spaces are needed for the length/offset checks below to
    # be satisfiable; a single space would make ``len(doc[0]) == 3`` false.
    doc = en_tokenizer("   This is a cat.")
    assert doc[0].idx == 0
    assert len(doc[0]) == 3
    assert doc[1].idx == 3
|
14
spacy/tests/regression/test_issue360.py
Normal file
14
spacy/tests/regression/test_issue360.py
Normal file
|
@ -0,0 +1,14 @@
|
|||
from __future__ import unicode_literals
|
||||
from ...en import English
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def en_tokenizer():
    """Create an English tokenizer from the language defaults."""
    english_defaults = English.Defaults
    tokenizer = english_defaults.create_tokenizer()
    return tokenizer
|
||||
|
||||
|
||||
def test_big_ellipsis(en_tokenizer):
    """A long run of dots between two words must yield more than two tokens."""
    text = u'$45...............Asking'
    doc = en_tokenizer(text)
    assert len(doc) > 2
|
|
@ -1,4 +0,0 @@
|
|||
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
|
||||
|
||||
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
|
||||
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]
|
|
@ -1,7 +1,23 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.en import English
|
||||
|
||||
from ...en import English
|
||||
from ...de import German
|
||||
from ...es import Spanish
|
||||
from ...it import Italian
|
||||
from ...fr import French
|
||||
from ...pt import Portuguese
|
||||
from ...nl import Dutch
|
||||
from ...sv import Swedish
|
||||
from ...hu import Hungarian
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def en_tokenizer(EN):
    """Return the tokenizer attribute of the ``EN`` fixture object."""
    english_tokenizer = EN.tokenizer
    return english_tokenizer
|
||||
# Language classes used to parametrize the shared ``tokenizer`` fixture.
# Portuguese is imported at the top of this module but was missing from the
# list, so its default tokenizer was never exercised.
LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
             Swedish, Hungarian]
|
||||
|
||||
|
||||
@pytest.fixture(params=LANGUAGES)
def tokenizer(request):
    """Create a default tokenizer for each language class in ``LANGUAGES``."""
    language_cls = request.param
    defaults = language_cls.Defaults
    return defaults.create_tokenizer()
|
||||
|
|
|
@ -1,58 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
|
||||
def test_possess(en_tokenizer):
    """The possessive "'s" is split from the noun into its own token."""
    doc = en_tokenizer("Mike's")
    noun_string = en_tokenizer.vocab.strings[doc[0].orth]
    assert noun_string == "Mike"
    clitic_string = en_tokenizer.vocab.strings[doc[1].orth]
    assert clitic_string == "'s"
    assert len(doc) == 2
|
||||
|
||||
|
||||
def test_apostrophe(en_tokenizer):
    """A trailing apostrophe is separated from the word before it."""
    doc = en_tokenizer("schools'")
    assert len(doc) == 2
    noun, apostrophe = doc[0], doc[1]
    assert apostrophe.orth_ == "'"
    assert noun.orth_ == "schools"
|
||||
|
||||
|
||||
def test_LL(en_tokenizer):
    """ "we'll" splits into "we" and "'ll", and "'ll" lemmatizes to "will"."""
    doc = en_tokenizer("we'll")
    assert len(doc) == 2
    clitic = doc[1]
    assert clitic.orth_ == "'ll"
    assert clitic.lemma_ == "will"
    pronoun = doc[0]
    assert pronoun.orth_ == "we"
|
||||
|
||||
|
||||
def test_aint(en_tokenizer):
    """ "ain't" splits into "ai" (lemma "be") and "n't" (lemma "not")."""
    doc = en_tokenizer("ain't")
    assert len(doc) == 2
    verb = doc[0]
    assert verb.orth_ == "ai"
    assert verb.lemma_ == "be"
    negation = doc[1]
    assert negation.orth_ == "n't"
    assert negation.lemma_ == "not"
|
||||
|
||||
def test_capitalized(en_tokenizer):
    """Contractions split in two regardless of capitalization."""
    for text in ("can't", "Can't", "Ain't"):
        doc = en_tokenizer(text)
        assert len(doc) == 2
    # ``doc`` still holds the result for "Ain't", the last text above.
    first = doc[0]
    assert first.orth_ == "Ai"
    assert first.lemma_ == "be"
|
||||
|
||||
|
||||
def test_punct(en_tokenizer):
    """A contraction keeps splitting in two when prefixed by punctuation."""
    cases = (("We've", 2), ("``We've", 3))
    for text, expected_length in cases:
        doc = en_tokenizer(text)
        assert len(doc) == expected_length
|
||||
|
||||
|
||||
@pytest.mark.xfail
def test_therell(en_tokenizer):
    """Expect ``there'll`` to split into ``there`` and ``'ll``.

    Marked as an expected failure.
    """
    tokens = en_tokenizer("there'll")
    assert len(tokens) == 2
    assert tokens[0].text == "there"
    # The second token is the contraction. The check previously compared it
    # to "there" again, duplicating the first-token assertion.
    assert tokens[1].text == "'ll"
|
|
@ -1,35 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
|
||||
def test_tweebo_challenge(en_tokenizer):
    """Each emoticon in the sample string comes out as one token, in order."""
    text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
    expected = [
        ":o", ":/", ":'(", ">:o", "(:", ":)", ">.<", "XD", "-__-", "o.O",
        ";D", ":-)", "@_@", ":P", "8D", ":1", ">:(", ":D", "=|", '")',
        ':>', '....',
    ]
    tokens = en_tokenizer(text)
    # Compare position by position, in the same order as the expected list.
    for index, emoticon in enumerate(expected):
        assert tokens[index].orth_ == emoticon
|
||||
|
||||
|
||||
def test_false_positive(en_tokenizer):
    """ "example:)" is tokenized into three tokens."""
    doc = en_tokenizer("example:)")
    token_count = len(doc)
    assert token_count == 3
|
41
spacy/tests/tokenizer/test_exceptions.py
Normal file
41
spacy/tests/tokenizer/test_exceptions.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer exceptions and emoticons are handled correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_tokenizer_handles_emoticons(tokenizer):
    """Each emoticon in the sample string comes out as one token, in order."""
    # Tweebo challenge (CMU)
    text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
    expected = [
        ":o", ":/", ":'(", ">:o", "(:", ":)", ">.<", "XD", "-__-", "o.O",
        ";D", ":-)", "@_@", ":P", "8D", ":1", ">:(", ":D", "=|", '")',
        ':>', '....',
    ]
    tokens = tokenizer(text)
    # Compare position by position, in the same order as the expected list.
    for index, emoticon in enumerate(expected):
        assert tokens[index].text == emoticon
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
    """Emoticon-like strings yield the parametrized number of tokens."""
    doc = tokenizer(text)
    token_count = len(doc)
    assert token_count == length
|
|
@ -1,62 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
def test_hyphen(en_tokenizer):
    """A hyphenated compound is tokenized into three tokens."""
    doc = en_tokenizer('best-known')
    assert len(doc) == 3
|
||||
|
||||
|
||||
def test_numeric_range(en_tokenizer):
    """A hyphen-separated numeric range is tokenized into three tokens."""
    doc = en_tokenizer('0.1-13.5')
    assert len(doc) == 3
|
||||
|
||||
def test_period(en_tokenizer):
    """ "best.Known" gives three tokens; "zombo.com" stays one token."""
    cases = (('best.Known', 3), ('zombo.com', 1))
    for text, expected_length in cases:
        doc = en_tokenizer(text)
        assert len(doc) == expected_length
|
||||
|
||||
|
||||
def test_ellipsis(en_tokenizer):
    """An ellipsis between two words gives three tokens for either casing."""
    for text in ('best...Known', 'best...known'):
        doc = en_tokenizer(text)
        assert len(doc) == 3
|
||||
|
||||
def test_big_ellipsis(en_tokenizer):
    """Regression check for Issue #360: a long dot run must still be split."""
    text = u'$45...............Asking'
    doc = en_tokenizer(text)
    assert len(doc) > 2
|
||||
|
||||
|
||||
|
||||
def test_email(en_tokenizer):
    """Each sample e-mail address is kept as a single token."""
    for address in ('hello@example.com', 'hi+there@gmail.it'):
        doc = en_tokenizer(address)
        assert len(doc) == 1
|
||||
|
||||
|
||||
def test_double_hyphen(en_tokenizer):
    """Check the first ten tokens of a sentence with "--" and "-" infixes."""
    tokens = en_tokenizer(u'No decent--let alone well-bred--people.')
    # TODO: This points to a deeper issue with the tokenizer: it doesn't
    # re-enter on infixes (relevant from the token after u'-' onwards).
    expected = [u'No', u'decent', u'--', u'let', u'alone', u'well', u'-',
                u'bred', u'--', u'people']
    for position, word in enumerate(expected):
        assert tokens[position].text == word
|
||||
|
||||
|
||||
def test_infix_comma(en_tokenizer):
    """A comma without surrounding spaces is split off as its own token."""
    # Re issue #326
    doc = en_tokenizer(u'Hello,world')
    for position, piece in enumerate([u'Hello', u',', u'world']):
        assert doc[position].text == piece
|
|
@ -1,9 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
def test_only_pre1(en_tokenizer):
    """A lone opening bracket is one token."""
    doc = en_tokenizer("(")
    assert len(doc) == 1
|
||||
|
||||
|
||||
def test_only_pre2(en_tokenizer):
    """Two opening brackets are two tokens."""
    doc = en_tokenizer("((")
    assert len(doc) == 2
|
|
@ -1,43 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def close_puncts():
    """Closing punctuation characters used by the tests in this module."""
    closers = [')', ']', '}', '*']
    return closers
|
||||
|
||||
|
||||
def test_close(close_puncts, en_tokenizer):
    """One closing punctuation mark after a word is split off."""
    word = 'Hello'
    for punct in close_puncts:
        doc = en_tokenizer(word + punct)
        assert len(doc) == 2
        assert doc[1].string == punct
        assert doc[0].string == word
|
||||
|
||||
|
||||
def test_two_different_close(close_puncts, en_tokenizer):
    """A closing mark followed by an apostrophe gives two suffix tokens."""
    word = 'Hello'
    for punct in close_puncts:
        doc = en_tokenizer(word + punct + "'")
        assert len(doc) == 3
        for position, piece in enumerate((word, punct, "'")):
            assert doc[position].string == piece
|
||||
|
||||
|
||||
def test_three_same_close(close_puncts, en_tokenizer):
    """Three identical closing marks after a word give four tokens."""
    word = 'Hello'
    for punct in close_puncts:
        doc = en_tokenizer(word + punct * 3)
        assert len(doc) == 4
        assert doc[0].string == word
        assert doc[1].string == punct
|
||||
|
||||
|
||||
def test_double_end_quote(en_tokenizer):
    """Two apostrophes in a row count as one token, alone or after a word."""
    for text, expected_length in (("Hello''", 2), ("''", 1)):
        doc = en_tokenizer(text)
        assert len(doc) == expected_length
|
|
@ -1,46 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def open_puncts():
    """Opening punctuation characters used by the tests in this module."""
    openers = ['(', '[', '{', '*']
    return openers
|
||||
|
||||
|
||||
def test_open(open_puncts, en_tokenizer):
    """One opening punctuation mark before a word is split off."""
    word = 'Hello'
    for punct in open_puncts:
        doc = en_tokenizer(punct + word)
        assert len(doc) == 2
        assert doc[0].orth_ == punct
        assert doc[1].orth_ == word
|
||||
|
||||
|
||||
def test_two_different_open(open_puncts, en_tokenizer):
    """An opening mark followed by a backtick gives two prefix tokens."""
    word = 'Hello'
    for punct in open_puncts:
        doc = en_tokenizer(punct + "`" + word)
        assert len(doc) == 3
        for position, piece in enumerate((punct, "`", word)):
            assert doc[position].orth_ == piece
|
||||
|
||||
|
||||
def test_three_same_open(open_puncts, en_tokenizer):
    """Three identical opening marks before a word give four tokens."""
    word = 'Hello'
    for punct in open_puncts:
        doc = en_tokenizer(punct * 3 + word)
        assert len(doc) == 4
        assert doc[0].orth_ == punct
        assert doc[3].orth_ == word
|
||||
|
||||
|
||||
def test_open_appostrophe(en_tokenizer):
    """A leading apostrophe is split from the following word."""
    doc = en_tokenizer("'The")
    assert len(doc) == 2
    leading = doc[0]
    assert leading.orth_ == "'"
|
|
@ -1,46 +0,0 @@
|
|||
"""Test entries in the tokenization special-case interacting with prefix
|
||||
and suffix punctuation."""
|
||||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
|
||||
def test_no_special(en_tokenizer):
    """A bracketed ordinary word gives three tokens."""
    doc = en_tokenizer("(can)")
    assert len(doc) == 3
|
||||
|
||||
|
||||
def test_no_punct(en_tokenizer):
    """The bare contraction gives two tokens."""
    doc = en_tokenizer("can't")
    assert len(doc) == 2
|
||||
|
||||
|
||||
def test_prefix(en_tokenizer):
    """An opening bracket before the contraction gives three tokens."""
    doc = en_tokenizer("(can't")
    assert len(doc) == 3
|
||||
|
||||
|
||||
def test_suffix(en_tokenizer):
    """A closing bracket after the contraction gives three tokens."""
    doc = en_tokenizer("can't)")
    assert len(doc) == 3
|
||||
|
||||
|
||||
def test_wrap(en_tokenizer):
    """The contraction wrapped in brackets gives four tokens."""
    doc = en_tokenizer("(can't)")
    assert len(doc) == 4
|
||||
|
||||
|
||||
def test_uneven_wrap(en_tokenizer):
    """One prefix and two suffix marks around the contraction: five tokens."""
    doc = en_tokenizer("(can't?)")
    assert len(doc) == 5
|
||||
|
||||
|
||||
def test_prefix_interact(en_tokenizer):
    """ "U.S." stays whole, "us." splits, and a prefix adds one token."""
    cases = (("U.S.", 1), ("us.", 2), ("(U.S.", 2))
    for text, expected_length in cases:
        doc = en_tokenizer(text)
        assert len(doc) == expected_length
|
||||
|
||||
|
||||
def test_suffix_interact(en_tokenizer):
    """ "U.S." followed by a closing bracket gives two tokens."""
    doc = en_tokenizer("U.S.)")
    assert len(doc) == 2
|
||||
|
||||
|
||||
def test_even_wrap_interact(en_tokenizer):
    """ "U.S." wrapped in brackets gives three tokens."""
    doc = en_tokenizer("(U.S.)")
    assert len(doc) == 3
|
||||
|
||||
|
||||
def test_uneven_wrap_interact(en_tokenizer):
    """ "U.S." with one prefix and two suffix marks gives four tokens."""
    doc = en_tokenizer("(U.S.?)")
    assert len(doc) == 4
|
|
@ -1,9 +0,0 @@
|
|||
"""Test suspected freeing of strings"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
def test_one(en_tokenizer):
    """The first token text is still readable after a second tokenizer call."""
    first_doc = en_tokenizer('Betty Botter bought a pound of butter.')
    assert first_doc[0].orth_ == 'Betty'
    second_doc = en_tokenizer('Betty also bought a pound of butter.')
    assert second_doc[0].orth_ == 'Betty'
|
|
@ -1,32 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def paired_puncts():
    """(opening, closing) punctuation pairs used by the tests in this module."""
    pairs = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
    return pairs
|
||||
|
||||
|
||||
def test_token(paired_puncts, en_tokenizer):
    """A word wrapped in a punctuation pair gives opener, word, closer."""
    word = 'Hello'
    for opener, closer in paired_puncts:
        doc = en_tokenizer(opener + word + closer)
        assert len(doc) == 3
        for position, piece in enumerate((opener, word, closer)):
            assert doc[position].orth_ == piece
|
||||
|
||||
|
||||
def test_two_different(paired_puncts, en_tokenizer):
    """A word wrapped in a punctuation pair and outer quotes gives five tokens.

    The input is a backtick, the opening mark, the word, the closing mark
    and an apostrophe; each is expected as its own token, in that order.
    """
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = "`" + open_ + word_str + close_ + "'"
        tokens = en_tokenizer(string)
        assert len(tokens) == 5
        # Each position is checked once; the duplicated check of
        # ``tokens[2]`` was dropped.
        assert tokens[0].orth_ == "`"
        assert tokens[1].orth_ == open_
        assert tokens[2].orth_ == word_str
        assert tokens[3].orth_ == close_
        assert tokens[4].orth_ == "'"
|
|
@ -1,172 +1,83 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
from os import path
|
||||
|
||||
import pytest
|
||||
import io
|
||||
import pickle
|
||||
import cloudpickle
|
||||
import tempfile
|
||||
|
||||
from ... import util
|
||||
from ...language_data import TOKENIZER_PREFIXES
|
||||
from ...util import utf8open
|
||||
|
||||
en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
|
||||
|
||||
# @pytest.mark.xfail
|
||||
# def test_pickle(en_tokenizer):
|
||||
# file_ = io.BytesIO()
|
||||
# cloudpickle.dump(en_tokenizer, file_)
|
||||
# file_.seek(0)
|
||||
# loaded = pickle.load(file_)
|
||||
# assert loaded is not None
|
||||
|
||||
def test_pre_punct_regex():
    """The compiled prefix search matches the opening bracket of "(can't"."""
    found = en_search_prefixes("(can't")
    matched_text = found.group()
    assert matched_text == "("
|
||||
|
||||
def test_no_word(en_tokenizer):
|
||||
tokens = en_tokenizer(u'')
|
||||
def test_tokenizer_handles_no_word(tokenizer):
|
||||
tokens = tokenizer("")
|
||||
assert len(tokens) == 0
|
||||
|
||||
|
||||
def test_single_word(en_tokenizer):
    """A single word comes back as the first token, unchanged."""
    doc = en_tokenizer(u'hello')
    first = doc[0]
    assert first.orth_ == 'hello'
|
||||
@pytest.mark.parametrize('text', ["lorem"])
def test_tokenizer_handles_single_word(tokenizer, text):
    """A single word comes back as the first token, unchanged."""
    doc = tokenizer(text)
    first = doc[0]
    assert first.text == text
|
||||
|
||||
|
||||
def test_two_words(en_tokenizer):
    """Two space-separated words give two tokens with different text."""
    doc = en_tokenizer('hello possums')
    assert len(doc) == 2
    first, second = doc[0], doc[1]
    assert first.orth_ != second.orth_
|
||||
|
||||
|
||||
def test_punct(en_tokenizer):
|
||||
tokens = en_tokenizer('hello, possums.')
|
||||
def test_tokenizer_handles_punct(tokenizer):
|
||||
text = "Lorem, ipsum."
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 4
|
||||
assert tokens[0].orth_ == 'hello'
|
||||
assert tokens[1].orth_ == ','
|
||||
assert tokens[2].orth_ == 'possums'
|
||||
assert tokens[1].orth_ != 'hello'
|
||||
assert tokens[0].text == "Lorem"
|
||||
assert tokens[1].text == ","
|
||||
assert tokens[2].text == "ipsum"
|
||||
assert tokens[1].text != "Lorem"
|
||||
|
||||
|
||||
def test_digits(en_tokenizer):
    """A number before a full stop is its own token; IDs match the vocab."""
    doc = en_tokenizer('The year: 1984.')
    assert len(doc) == 5
    vocab = en_tokenizer.vocab
    assert doc[0].orth == vocab['The'].orth
    assert doc[3].orth == vocab['1984'].orth
|
||||
def test_tokenizer_handles_digits(tokenizer):
    """A number before a full stop is its own token, except for ``hu``.

    NOTE(review): indentation was lost in the source; all three assertions
    are assumed to sit inside the language guard -- confirm.
    """
    skipped_langs = ["hu"]
    doc = tokenizer("Lorem ipsum: 1984.")

    # Languages on the exception list are not checked at all.
    if doc[0].lang_ in skipped_langs:
        return
    assert len(doc) == 5
    assert doc[0].text == "Lorem"
    assert doc[3].text == "1984"
|
||||
|
||||
|
||||
def test_contraction(en_tokenizer):
    """ "n't" and a trailing "!" are separate tokens with vocab-matching IDs."""
    doc = en_tokenizer("don't giggle")
    assert len(doc) == 3
    negation_id = en_tokenizer.vocab["n't"].orth
    assert doc[1].orth == negation_id

    doc = en_tokenizer("i said don't!")
    assert len(doc) == 5
    bang_id = en_tokenizer.vocab['!'].orth
    assert doc[4].orth == bang_id
|
||||
|
||||
def test_contraction_punct(en_tokenizer):
    """Contractions still split when punctuation is attached to them."""
    words = [token.text for token in en_tokenizer("(can't")]
    assert words == ['(', 'ca', "n't"]
    # Each remaining sample has one punctuation mark plus a contraction.
    for text in ("`ain't", '''"isn't''', "can't!"):
        doc = en_tokenizer(text)
        assert len(doc) == 3
|
||||
@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
def test_tokenizer_keep_urls(tokenizer, text):
    """Each parametrized domain name is kept as a single token."""
    doc = tokenizer(text)
    token_count = len(doc)
    assert token_count == 1
|
||||
|
||||
|
||||
def test_sample(en_tokenizer):
|
||||
text = """Tributes pour in for late British Labour Party leader
|
||||
@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
def test_tokenizer_keeps_email(tokenizer, text):
    """Each parametrized e-mail address is kept as a single token."""
    doc = tokenizer(text)
    token_count = len(doc)
    assert token_count == 1
|
||||
|
||||
Tributes poured in from around the world Thursday
|
||||
to the late Labour Party leader John Smith, who died earlier from a massive
|
||||
heart attack aged 55.
|
||||
|
||||
In Washington, the US State Department issued a statement regretting "the
|
||||
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
|
||||
def test_tokenizer_handles_long_text(tokenizer):
|
||||
text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit
|
||||
|
||||
"Mr. Smith, throughout his distinguished"""
|
||||
Cras egestas orci non porttitor maximus.
|
||||
Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.
|
||||
|
||||
"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo."""
|
||||
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) > 5
|
||||
|
||||
|
||||
def test_cnts1(en_tokenizer):
    """The sample sentence containing "U.S." gives eight tokens."""
    sentence = u"""The U.S. Army likes Shock and Awe."""
    doc = en_tokenizer(sentence)
    assert len(doc) == 8
|
||||
@pytest.mark.parametrize('file_name', ["sun.txt"])
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
    """Tokenize a text file from the parent directory of this module.

    The file must be non-empty and produce more than 100 tokens.
    """
    loc = path.join(path.dirname(__file__), '..', file_name)
    # Close the handle deterministically instead of leaving it open.
    # NOTE(review): assumes ``utf8open`` returns a regular file object that
    # supports the context-manager protocol -- confirm against its definition.
    with utf8open(loc) as file_:
        text = file_.read()
    assert len(text) != 0
    tokens = tokenizer(text)
    assert len(tokens) > 100
|
||||
|
||||
|
||||
def test_cnts2(en_tokenizer):
    """The sample sentence starting with "U.N." gives ten tokens."""
    sentence = u"""U.N. regulations are not a part of their concern."""
    doc = en_tokenizer(sentence)
    assert len(doc) == 10
|
||||
|
||||
|
||||
def test_cnts3(en_tokenizer):
    """A contraction question inside curly quotes gives six tokens."""
    doc = en_tokenizer(u"“Isn't it?”")
    words = []
    for token in doc:
        words.append(token.orth_)
    assert len(words) == 6
|
||||
|
||||
|
||||
def test_cnts4(en_tokenizer):
    """The quoted-speech sample sentence gives fifteen tokens."""
    sentence = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
    doc = en_tokenizer(sentence)
    words = []
    for token in doc:
        words.append(token.orth_)
    assert len(words) == 15
|
||||
|
||||
|
||||
def test_cnts5(en_tokenizer):
    """The single-quoted sample with "Mr." and an initial gives eleven tokens."""
    sentence = """'Me too!', Mr. P. Delaware cried. """
    doc = en_tokenizer(sentence)
    assert len(doc) == 11
|
||||
|
||||
|
||||
@pytest.mark.xfail
def test_mr(en_tokenizer):
    """Expect "Mr." to stay whole right after a full stop (expected failure)."""
    doc = en_tokenizer("""Today is Tuesday.Mr.""")
    assert len(doc) == 5
    words = [token.orth_ for token in doc]
    assert words == ['Today', 'is', 'Tuesday', '.', 'Mr.']
|
||||
|
||||
|
||||
def test_cnts6(en_tokenizer):
|
||||
text = u'They ran about 10km.'
|
||||
tokens = en_tokenizer(text)
|
||||
words = [t.orth_ for t in tokens]
|
||||
assert len(words) == 6
|
||||
|
||||
def test_bracket_period(en_tokenizer):
|
||||
text = u'(And a 6a.m. run through Washington Park).'
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[len(tokens) - 1].orth_ == u'.'
|
||||
|
||||
|
||||
def test_ie(en_tokenizer):
|
||||
text = u"It's mediocre i.e. bad."
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 6
|
||||
assert tokens[3].orth_ == "i.e."
|
||||
|
||||
|
||||
def test_two_whitespace(en_tokenizer):
    """Check that trailing double whitespace survives tokenization.

    The text is tokenized and `text_with_ws` is compared with the input, so
    any whitespace dropped by the tokenizer makes the test fail.
    """
    # The input has to end in TWO spaces, as the sentence itself states.
    orig_str = u'there are 2 spaces after this  '
    tokens = en_tokenizer(orig_str)
    # repr() is compared so that a failure message shows the whitespace.
    assert repr(tokens.text_with_ws) == repr(orig_str)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_em_dash_infix(en_tokenizer):
|
||||
# Re Issue #225
|
||||
tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, '''
|
||||
'''you'll have to walk there.\u2014Ariel.''')
|
||||
assert tokens[6].text == 'Puddleton'
|
||||
assert tokens[7].text == '?'
|
||||
assert tokens[8].text == '\u2014'
|
||||
|
||||
#def test_cnts7():
|
||||
# text = 'But then the 6,000-year ice age came...'
|
||||
# tokens = EN.tokenize(text)
|
||||
# assert len(tokens) == 10
|
||||
def test_tokenizer_suspected_freeing_strings(tokenizer):
    """Tokenize two texts back to back and check both still start with "Lorem".

    Both documents are created before either is inspected, so the first
    document's token text is read only after the second call has run.
    """
    texts = (
        "Lorem dolor sit amet, consectetur adipiscing elit.",
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
    )
    docs = [tokenizer(text) for text in texts]
    for doc in docs:
        assert doc[0].text == "Lorem"
|
||||
|
|
|
@ -1,67 +1,51 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokens are created correctly for whitespace."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_single_space(en_tokenizer):
|
||||
tokens = en_tokenizer('hello possums')
|
||||
@pytest.mark.parametrize('text', ["lorem ipsum"])
|
||||
def test_tokenizer_splits_single_space(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
def test_double_space(en_tokenizer):
|
||||
tokens = en_tokenizer('hello possums')
|
||||
# The parametrized text needs TWO spaces between the words: the assertions
# below expect a separate whitespace token in the middle.
@pytest.mark.parametrize('text', ["lorem  ipsum"])
def test_tokenizer_splits_double_space(tokenizer, text):
    """Check that a double space between two words yields a " " token."""
    tokens = tokenizer(text)
    assert len(tokens) == 3
    assert tokens[1].orth_ == ' '
    assert tokens[1].text == " "
|
||||
|
||||
|
||||
def test_newline(en_tokenizer):
|
||||
tokens = en_tokenizer('hello\npossums')
|
||||
# The parametrized text ends in TWO spaces; that is the case under test.
@pytest.mark.parametrize('text', ["lorem ipsum  "])
def test_tokenizer_handles_double_trainling_ws(tokenizer, text):
    """Check that double trailing whitespace is preserved in `text_with_ws`."""
    tokens = tokenizer(text)
    # repr() is compared so that a failure message shows the whitespace.
    assert repr(tokens.text_with_ws) == repr(text)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem\nipsum"])
|
||||
def test_tokenizer_splits_newline(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].text == "\n"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["lorem \nipsum"])
|
||||
def test_tokenizer_splits_newline_space(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
def test_newline_space(en_tokenizer):
|
||||
tokens = en_tokenizer('hello \npossums')
|
||||
# TWO spaces before the newline; with a single space this would duplicate
# test_tokenizer_splits_newline_space.
@pytest.mark.parametrize('text', ["lorem  \nipsum"])
def test_tokenizer_splits_newline_double_space(tokenizer, text):
    """Check that two spaces followed by a newline still give three tokens."""
    tokens = tokenizer(text)
    assert len(tokens) == 3
|
||||
|
||||
|
||||
def test_newline_double_space(en_tokenizer):
|
||||
tokens = en_tokenizer('hello \npossums')
|
||||
@pytest.mark.parametrize('text', ["lorem \n ipsum"])
|
||||
def test_tokenizer_splits_newline_space_wrap(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
def test_newline_space_wrap(en_tokenizer):
|
||||
tokens = en_tokenizer('hello \n possums')
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
def test_leading_space_offsets(en_tokenizer):
    """Regression test for Issue #351: offsets in text with leading spaces.

    The issue compared u"This is a cat." with the same sentence preceded by
    spaces, indexing the original text with `token.idx`; the variant with
    leading spaces was reported as not working. The assertions below check
    that the first token starts at offset 0 and is three characters long,
    and that the second token starts at offset 3.
    """
    # Three leading spaces: the assertions on len(doc[0]) and doc[1].idx
    # only hold for a three-character whitespace prefix.
    doc = en_tokenizer(u"   This is a cat.")
    assert doc[0].idx == 0
    assert len(doc[0]) == 3
    assert doc[1].idx == 3
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.util import utf8open
|
||||
|
||||
import pytest
|
||||
from os import path
|
||||
|
||||
|
||||
HERE = path.dirname(__file__)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sun_txt():
|
||||
loc = path.join(HERE, '..', 'sun.txt')
|
||||
return utf8open(loc).read()
|
||||
|
||||
|
||||
def test_tokenize(sun_txt, en_tokenizer):
|
||||
assert len(sun_txt) != 0
|
||||
tokens = en_tokenizer(sun_txt)
|
||||
assert len(tokens) > 100
|
|
@ -1,20 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
import os
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
def nlp():
|
||||
from spacy.en import English
|
||||
if os.environ.get('SPACY_DATA'):
|
||||
data_dir = os.environ.get('SPACY_DATA')
|
||||
else:
|
||||
data_dir = True
|
||||
return English(path=data_dir)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def doc(nlp):
|
||||
for word in ['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']:
|
||||
_ = nlp.vocab[word]
|
||||
return nlp('Hello, world. Here are two sentences.')
|
|
@ -1,172 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
from spacy.attrs import HEAD
|
||||
import numpy
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_example_war_and_peace(nlp):
|
||||
# from spacy.en import English
|
||||
from spacy._doc_examples import download_war_and_peace
|
||||
|
||||
unprocessed_unicode = download_war_and_peace()
|
||||
|
||||
# nlp = English()
|
||||
# TODO: ImportError: No module named _doc_examples
|
||||
doc = nlp(unprocessed_unicode)
|
||||
|
||||
|
||||
def test_main_entry_point(nlp):
|
||||
# from spacy.en import English
|
||||
# nlp = English()
|
||||
doc = nlp('Some text.') # Applies tagger, parser, entity
|
||||
doc = nlp('Some text.', parse=False) # Applies tagger and entity, not parser
|
||||
doc = nlp('Some text.', entity=False) # Applies tagger and parser, not entity
|
||||
doc = nlp('Some text.', tag=False) # Does not apply tagger, entity or parser
|
||||
doc = nlp('') # Zero-length tokens, not an error
|
||||
# doc = nlp(b'Some text') <-- Error: need unicode
|
||||
doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_sentence_spans(nlp):
|
||||
# from spacy.en import English
|
||||
# nlp = English()
|
||||
doc = nlp("This is a sentence. Here's another...")
|
||||
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_entity_spans(nlp):
|
||||
# from spacy.en import English
|
||||
# nlp = English()
|
||||
tokens = nlp('Mr. Best flew to New York on Saturday morning.')
|
||||
ents = list(tokens.ents)
|
||||
assert ents[0].label == 346
|
||||
assert ents[0].label_ == 'PERSON'
|
||||
assert ents[0].orth_ == 'Best'
|
||||
assert ents[0].string == ents[0].string
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_noun_chunk_spans(nlp):
|
||||
# from spacy.en import English
|
||||
# nlp = English()
|
||||
doc = nlp('The sentence in this example has three noun chunks.')
|
||||
for chunk in doc.noun_chunks:
|
||||
print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_)
|
||||
|
||||
# NP The sentence <-- has
|
||||
# NP this example <-- in
|
||||
# NP three noun chunks <-- has
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_count_by(nlp):
|
||||
# from spacy.en import English, attrs
|
||||
# nlp = English()
|
||||
import numpy
|
||||
from spacy import attrs
|
||||
tokens = nlp('apple apple orange banana')
|
||||
assert tokens.count_by(attrs.ORTH) == {3699: 2, 3750: 1, 5965: 1}
|
||||
assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[3699],
|
||||
[3699],
|
||||
[3750],
|
||||
[5965]], dtype=numpy.int32))
|
||||
|
||||
@pytest.mark.models
def test_read_bytes(nlp):
    """Write two serialized docs to one file and read both back.

    The two byte strings are written one after the other; `Doc.read_bytes`
    is then expected to yield one byte string per document.
    """
    import os
    import tempfile

    from spacy.tokens.doc import Doc

    # Use a scratch file instead of dropping 'test_serialize.bin' into the
    # current working directory, and always remove it afterwards.
    handle, loc = tempfile.mkstemp(suffix='.bin')
    os.close(handle)
    try:
        with open(loc, 'wb') as file_:
            file_.write(nlp(u'This is a document.').to_bytes())
            file_.write(nlp(u'This is another.').to_bytes())
        docs = []
        with open(loc, 'rb') as file_:
            for byte_string in Doc.read_bytes(file_):
                docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    finally:
        os.remove(loc)
    assert len(docs) == 2
|
||||
|
||||
|
||||
def test_token_span(doc):
|
||||
span = doc[4:6]
|
||||
token = span[0]
|
||||
assert token.i == 4
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_example_i_like_new_york1(nlp):
|
||||
toks = nlp('I like New York in Autumn.')
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def toks(nlp):
|
||||
doc = nlp('I like New York in Autumn.')
|
||||
doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
|
||||
return doc
|
||||
|
||||
|
||||
def test_example_i_like_new_york2(toks):
|
||||
i, like, new, york, in_, autumn, dot = range(len(toks))
|
||||
|
||||
|
||||
def tok(toks, tok):
    """Return the index of the word named `tok` in the example sentence.

    toks: the seven-token example document (the unpacking below raises
        ValueError for any other length).
    tok: one of 'i', 'like', 'new', 'york', 'in_', 'autumn', 'dot'.

    Raises KeyError for an unknown name.

    This is a plain helper rather than a fixture: as a fixture it named
    itself as a dependency, and the fixtures below call it directly, which
    pytest does not allow for fixture functions.
    """
    i, like, new, york, in_, autumn, dot = range(len(toks))
    positions = {'i': i, 'like': like, 'new': new, 'york': york,
                 'in_': in_, 'autumn': autumn, 'dot': dot}
    return positions[tok]


@pytest.fixture
def new(toks):
    """Index of 'New' in the example sentence."""
    return tok(toks, "new")


@pytest.fixture
def york(toks):
    """Index of 'York' in the example sentence."""
    return tok(toks, "york")


@pytest.fixture
def autumn(toks):
    """Index of 'Autumn' in the example sentence."""
    return tok(toks, "autumn")


@pytest.fixture
def dot(toks):
    """Index of the final '.' in the example sentence."""
    return tok(toks, "dot")
|
||||
|
||||
|
||||
def test_example_i_like_new_york3(toks, new, york):
|
||||
assert toks[new].head.orth_ == 'York'
|
||||
assert toks[york].head.orth_ == 'like'
|
||||
|
||||
|
||||
def test_example_i_like_new_york4(toks, new, york):
|
||||
new_york = toks[new:york+1]
|
||||
assert new_york.root.orth_ == 'York'
|
||||
|
||||
|
||||
def test_example_i_like_new_york5(toks, autumn, dot):
|
||||
assert toks[autumn].head.orth_ == 'in'
|
||||
assert toks[dot].head.orth_ == 'like'
|
||||
autumn_dot = toks[autumn:]
|
||||
assert autumn_dot.root.orth_ == 'Autumn'
|
||||
|
||||
|
||||
def test_navigating_the_parse_tree_lefts(doc):
|
||||
# TODO: where does the span object come from?
|
||||
span = doc[:2]
|
||||
lefts = [span.doc[i] for i in range(0, span.start)
|
||||
if span.doc[i].head in span]
|
||||
|
||||
|
||||
def test_navigating_the_parse_tree_rights(doc):
|
||||
span = doc[:2]
|
||||
rights = [span.doc[i] for i in range(span.end, len(span.doc))
|
||||
if span.doc[i].head in span]
|
||||
|
||||
|
||||
def test_string_store(doc):
|
||||
string_store = doc.vocab.strings
|
||||
for i, string in enumerate(string_store):
|
||||
assert i == string_store[string]
|
|
@ -1,180 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
import spacy
|
||||
import os
|
||||
|
||||
|
||||
try:
|
||||
xrange
|
||||
except NameError:
|
||||
xrange = range
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def token(doc):
|
||||
return doc[0]
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_load_resources_and_process_text():
|
||||
from spacy.en import English
|
||||
nlp = English()
|
||||
doc = nlp(u'Hello, world. Here are two sentences.')
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_get_tokens_and_sentences(doc):
|
||||
token = doc[0]
|
||||
sentence = next(doc.sents)
|
||||
assert token is sentence[0]
|
||||
assert sentence.text == 'Hello, world.'
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_use_integer_ids_for_any_strings(nlp, token):
|
||||
hello_id = nlp.vocab.strings['Hello']
|
||||
hello_str = nlp.vocab.strings[hello_id]
|
||||
|
||||
assert token.orth == hello_id == 3125
|
||||
assert token.orth_ == hello_str == 'Hello'
|
||||
|
||||
|
||||
def test_get_and_set_string_views_and_flags(nlp, token):
|
||||
assert token.shape_ == 'Xxxxx'
|
||||
for lexeme in nlp.vocab:
|
||||
if lexeme.is_alpha:
|
||||
lexeme.shape_ = 'W'
|
||||
elif lexeme.is_digit:
|
||||
lexeme.shape_ = 'D'
|
||||
elif lexeme.is_punct:
|
||||
lexeme.shape_ = 'P'
|
||||
else:
|
||||
lexeme.shape_ = 'M'
|
||||
assert token.shape_ == 'W'
|
||||
|
||||
|
||||
def test_export_to_numpy_arrays(nlp, doc):
|
||||
from spacy.attrs import ORTH, LIKE_URL, IS_OOV
|
||||
|
||||
attr_ids = [ORTH, LIKE_URL, IS_OOV]
|
||||
doc_array = doc.to_array(attr_ids)
|
||||
assert doc_array.shape == (len(doc), len(attr_ids))
|
||||
assert doc[0].orth == doc_array[0, 0]
|
||||
assert doc[1].orth == doc_array[1, 0]
|
||||
assert doc[0].like_url == doc_array[0, 1]
|
||||
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_word_vectors(nlp):
|
||||
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
|
||||
|
||||
apples = doc[0]
|
||||
oranges = doc[2]
|
||||
boots = doc[6]
|
||||
hippos = doc[8]
|
||||
|
||||
assert apples.similarity(oranges) > boots.similarity(hippos)
|
||||
|
||||
|
||||
@pytest.mark.models
def test_part_of_speech_tags(nlp):
    """Documentation example: helpers that inspect coarse and fine POS tags.

    The helpers are only defined here, never called; the test passes as long
    as the import and the two StringStore lookups succeed.
    """
    from spacy.parts_of_speech import ADV

    def is_adverb(token):
        # Compare against the constant imported above.
        return token.pos == ADV

    # These are data-specific, so no constants are provided. You have to look
    # up the IDs from the StringStore.
    NNS = nlp.vocab.strings['NNS']
    NNPS = nlp.vocab.strings['NNPS']

    def is_plural_noun(token):
        return token.tag == NNS or token.tag == NNPS

    def print_coarse_pos(token):
        print(token.pos_)

    def print_fine_pos(token):
        print(token.tag_)
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_syntactic_dependencies():
|
||||
def dependency_labels_to_root(token):
|
||||
'''Walk up the syntactic tree, collecting the arc labels.'''
|
||||
dep_labels = []
|
||||
while token.head is not token:
|
||||
dep_labels.append(token.dep)
|
||||
token = token.head
|
||||
return dep_labels
|
||||
|
||||
|
||||
@pytest.mark.models
def test_named_entities():
    """Documentation example: helpers that work with named entities.

    The helpers are only defined here, never called, so the test itself only
    checks that the imports and definitions succeed.
    """
    from collections import defaultdict

    from spacy.parts_of_speech import VERB

    def iter_products(docs):
        """Yield every entity labelled 'PRODUCT' in `docs`."""
        for doc in docs:
            for ent in doc.ents:
                if ent.label_ == 'PRODUCT':
                    yield ent

    def word_is_in_entity(word):
        return word.ent_type != 0

    def count_parent_verb_by_person(docs):
        """Count, per PERSON entity text, the lemmas of the verbs heading it."""
        # The outer factory has to be a callable; passing a defaultdict
        # instance raises TypeError.
        counts = defaultdict(lambda: defaultdict(int))
        for doc in docs:
            for ent in doc.ents:
                if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
                    counts[ent.orth_][ent.root.head.lemma_] += 1
        return counts
|
||||
|
||||
|
||||
def test_calculate_inline_mark_up_on_original_string():
|
||||
def put_spans_around_tokens(doc, get_classes):
|
||||
'''Given some function to compute class names, put each token in a
|
||||
span element, with the appropriate classes computed.
|
||||
|
||||
All whitespace is preserved, outside of the spans. (Yes, I know HTML
|
||||
won't display it. But the point is no information is lost, so you can
|
||||
calculate what you need, e.g. <br /> tags, <p> tags, etc.)
|
||||
'''
|
||||
output = []
|
||||
template = '<span classes="{classes}">{word}</span>{space}'
|
||||
for token in doc:
|
||||
if token.is_space:
|
||||
output.append(token.orth_)
|
||||
else:
|
||||
output.append(
|
||||
template.format(
|
||||
classes=' '.join(get_classes(token)),
|
||||
word=token.orth_,
|
||||
space=token.whitespace_))
|
||||
string = ''.join(output)
|
||||
string = string.replace('\n', '')
|
||||
string = string.replace('\t', ' ')
|
||||
return string
|
||||
|
||||
|
||||
@pytest.mark.models
def test_efficient_binary_serialization(doc):
    """Serialize a doc to 'moby_dick.bin' and load it back into a new vocab.

    The byte string produced by `doc.to_bytes()` is written to disk, then
    every byte string yielded by `Doc.read_bytes` is loaded into a new `Doc`.
    """
    from spacy.tokens.doc import Doc

    byte_string = doc.to_bytes()
    # Context managers close both handles, so the data is flushed before
    # the file is read back.
    with open('moby_dick.bin', 'wb') as file_:
        file_.write(byte_string)

    nlp = spacy.en.English()
    with open('moby_dick.bin', 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            doc = Doc(nlp.vocab)
            doc.from_bytes(byte_string)
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_multithreading(nlp):
|
||||
texts = [u'One document.', u'...', u'Lots of documents']
|
||||
# .pipe streams input, and produces streaming output
|
||||
iter_texts = (texts[i % 3] for i in xrange(100000000))
|
||||
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
|
||||
assert doc.is_parsed
|
||||
if i == 100:
|
||||
break
|
||||
|
|
@ -94,8 +94,13 @@ def read_regex(path):
|
|||
|
||||
|
||||
def compile_prefix_regex(entries):
    """Compile prefix entries into one regex anchored at the string start.

    entries: iterable of strings; blank or whitespace-only entries are
        skipped.

    If a bare '(' is one of the entries, the data is treated as the
    deprecated format and every piece is escaped, i.e. matched literally.
    Otherwise each piece is used verbatim as a regular-expression fragment.

    Returns the compiled pattern, an alternation of '^' + piece.
    """
    pieces = [piece for piece in entries if piece.strip()]
    if '(' in entries:
        # Handle deprecated data
        pieces = [re.escape(piece) for piece in pieces]
    expression = '|'.join('^' + piece for piece in pieces)
    return re.compile(expression)
|
||||
|
||||
|
||||
def compile_suffix_regex(entries):
|
||||
|
|
|
@ -22,7 +22,8 @@
|
|||
"twitter": "spacy_io",
|
||||
"github": "explosion",
|
||||
"reddit": "spacynlp",
|
||||
"codepen": "explosion"
|
||||
"codepen": "explosion",
|
||||
"gitter": "explosion/spaCy"
|
||||
},
|
||||
|
||||
"NAVIGATION": {
|
||||
|
@ -53,7 +54,7 @@
|
|||
}
|
||||
},
|
||||
|
||||
"V_CSS": "1.10",
|
||||
"V_CSS": "1.14",
|
||||
"V_JS": "1.0",
|
||||
"DEFAULT_SYNTAX" : "python",
|
||||
"ANALYTICS": "UA-58931649-1",
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
//- 💫 MIXINS > BASE
|
||||
|
||||
//- Aside wrapper
|
||||
label - [string] aside label
|
||||
|
||||
mixin aside-wrapper(label)
|
||||
aside.c-aside
|
||||
|
@ -21,6 +22,10 @@ mixin date(input)
|
|||
|
||||
|
||||
//- SVG from map
|
||||
file - [string] SVG file name in /assets/img/
|
||||
name - [string] SVG symbol id
|
||||
width - [integer] width in px
|
||||
height - [integer] height in px (default: same as width)
|
||||
|
||||
mixin svg(file, name, width, height)
|
||||
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
|
||||
|
@ -28,19 +33,23 @@ mixin svg(file, name, width, height)
|
|||
|
||||
|
||||
//- Icon
|
||||
name - [string] icon name, should be SVG symbol ID
|
||||
size - [integer] icon width and height (default: 20)
|
||||
|
||||
mixin icon(name, size)
|
||||
+svg("icons", "icon-" + name, size || 20).o-icon&attributes(attributes)
|
||||
+svg("icons", name, size || 20).o-icon&attributes(attributes)
|
||||
|
||||
|
||||
//- Pro/Con/Neutral icon
|
||||
icon - [string] "pro", "con" or "neutral" (default: "neutral")
|
||||
|
||||
mixin procon(icon)
|
||||
- colors = { pro: "green", con: "red" }
|
||||
- colors = { pro: "green", con: "red", neutral: "yellow" }
|
||||
+icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
|
||||
|
||||
|
||||
//- Headlines Helper Mixin
|
||||
level - [integer] 1, 2, 3, 4, or 5
|
||||
|
||||
mixin headline(level)
|
||||
if level == 1
|
||||
|
@ -65,6 +74,7 @@ mixin headline(level)
|
|||
|
||||
|
||||
//- Permalink rendering
|
||||
id - [string] permalink ID used for link anchor
|
||||
|
||||
mixin permalink(id)
|
||||
if id
|
||||
|
@ -77,6 +87,7 @@ mixin permalink(id)
|
|||
|
||||
|
||||
//- Terminal-style code window
|
||||
label - [string] title displayed in top bar of terminal window
|
||||
|
||||
mixin terminal(label)
|
||||
.x-terminal
|
||||
|
@ -87,6 +98,18 @@ mixin terminal(label)
|
|||
block
|
||||
|
||||
|
||||
//- Gitter chat button and widget
|
||||
button - [string] text shown on button
|
||||
label - [string] title of chat window (default: same as button)
|
||||
|
||||
mixin gitter(button, label)
|
||||
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
|
||||
|
||||
button.js-gitter-button.c-chat__button.u-text-small
|
||||
+icon("chat").o-icon--inline
|
||||
!=button
|
||||
|
||||
|
||||
//- Logo
|
||||
|
||||
mixin logo()
|
||||
|
|
|
@ -44,7 +44,7 @@ mixin api(path)
|
|||
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block
|
||||
block
|
||||
|
||||
| #[+icon("book", 18).o-icon--inline.u-help.u-color-subtle]
|
||||
| #[+icon("book", 18).o-icon--inline.u-color-subtle]
|
||||
|
||||
|
||||
//- Aside for text
|
||||
|
|
|
@ -24,4 +24,6 @@ main.o-main.o-main--sidebar.o-main--aside
|
|||
.o-inline-list
|
||||
+button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)]
|
||||
|
||||
+gitter("spaCy chat")
|
||||
|
||||
include _footer
|
||||
|
|
23
website/_includes/_scripts.jade
Normal file
23
website/_includes/_scripts.jade
Normal file
|
@ -0,0 +1,23 @@
|
|||
//- 💫 INCLUDES > SCRIPTS
|
||||
|
||||
script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
|
||||
script(src="/assets/js/prism.js", type="text/javascript")
|
||||
|
||||
if SECTION == "docs"
|
||||
script.
|
||||
((window.gitter = {}).chat = {}).options = {
|
||||
useStyles: false,
|
||||
activationElement: '.js-gitter-button',
|
||||
targetElement: '.js-gitter',
|
||||
room: '!{SOCIAL.gitter}'
|
||||
};
|
||||
|
||||
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
|
||||
|
||||
if environment == "deploy"
|
||||
script
|
||||
| window.ga=window.ga||function(){
|
||||
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
|
||||
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
|
||||
|
||||
script(async src="https://www.google-analytics.com/analytics.js")
|
|
@ -52,13 +52,4 @@ html(lang="en")
|
|||
main!=yield
|
||||
include _includes/_footer
|
||||
|
||||
script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
|
||||
script(src="/assets/js/prism.js", type="text/javascript")
|
||||
|
||||
if environment == "deploy"
|
||||
script
|
||||
| window.ga=window.ga||function(){
|
||||
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
|
||||
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
|
||||
|
||||
script(async src="https://www.google-analytics.com/analytics.js")
|
||||
include _includes/_scripts
|
||||
|
|
|
@ -6,36 +6,36 @@
|
|||
font-family: "Source Sans Pro"
|
||||
font-style: normal
|
||||
font-weight: 400
|
||||
src: url("../fonts/sourcesanspro-regular.eot")
|
||||
src: url("../fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-regular.woff2") format("woff2"), url("../fonts/sourcesanspro-regular.woff") format("woff"), url("../fonts/sourcesanspro-regular.ttf") format("truetype"), url("../fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg")
|
||||
src: url("/assets/fonts/sourcesanspro-regular.eot")
|
||||
src: url("/assets/fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-regular.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-regular.woff") format("woff"), url("/assets/fonts/sourcesanspro-regular.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg")
|
||||
|
||||
@font-face
|
||||
font-family: "Source Sans Pro"
|
||||
font-style: italic
|
||||
font-weight: 400
|
||||
src: url("../fonts/sourcesanspro-italic.eot")
|
||||
src: url("../fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-italic.woff2") format("woff2"), url("../fonts/sourcesanspro-italic.woff") format("woff"), url("../fonts/sourcesanspro-italic.ttf") format("truetype"), url("../fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg")
|
||||
src: url("/assets/fonts/sourcesanspro-italic.eot")
|
||||
src: url("/assets/fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-italic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-italic.woff") format("woff"), url("/assets/fonts/sourcesanspro-italic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg")
|
||||
|
||||
@font-face
|
||||
font-family: "Source Sans Pro"
|
||||
font-style: normal
|
||||
font-weight: 700
|
||||
src: url("../fonts/sourcesanspro-bold.eot")
|
||||
src: url("../fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bold.woff2") format("woff2"), url("../fonts/sourcesanspro-bold.woff") format("woff"), url("../fonts/sourcesanspro-bold.ttf") format("truetype"), url("../fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg")
|
||||
src: url("/assets/fonts/sourcesanspro-bold.eot")
|
||||
src: url("/assets/fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bold.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bold.woff") format("woff"), url("/assets/fonts/sourcesanspro-bold.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg")
|
||||
|
||||
@font-face
|
||||
font-family: "Source Sans Pro"
|
||||
font-style: italic
|
||||
font-weight: 700
|
||||
src: url("../fonts/sourcesanspro-bolditalic.eot")
|
||||
src: url("../fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("../fonts/sourcesanspro-bolditalic.woff") format("woff"), url("../fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("../fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg")
|
||||
src: url("/assets/fonts/sourcesanspro-bolditalic.eot")
|
||||
src: url("/assets/fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bolditalic.woff") format("woff"), url("/assets/fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg")
|
||||
|
||||
|
||||
// Source Code Pro
|
||||
|
||||
@font-face
|
||||
font-family: "Source Code Pro"
|
||||
font-style: normal
|
||||
font-weight: 600
|
||||
src: url("../fonts/sourcecodepro-semibold.eot")
|
||||
src: url("../fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcecodepro-semibold.woff") format("woff"), url("../fonts/sourcecodepro-semibold.ttf") format("truetype"), url("../fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg")
|
||||
font-family: "Source Code Pro"
|
||||
font-style: normal
|
||||
font-weight: 600
|
||||
src: url("/assets/fonts/sourcecodepro-semibold.eot")
|
||||
src: url("/assets/fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcecodepro-semibold.woff") format("woff"), url("/assets/fonts/sourcecodepro-semibold.ttf") format("truetype"), url("/assets/fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg")
|
||||
|
|
|
@ -60,7 +60,7 @@
|
|||
background: $color-back
|
||||
border-radius: 2px
|
||||
border: 1px solid $color-subtle
|
||||
padding: 3.5% 2.5%
|
||||
padding: 3rem 2.5%
|
||||
|
||||
//- Icons
|
||||
|
||||
|
|
|
@ -141,12 +141,6 @@
|
|||
background: $pattern
|
||||
|
||||
|
||||
//- Cursors
|
||||
|
||||
.u-help
|
||||
cursor: help
|
||||
|
||||
|
||||
//- Hidden elements
|
||||
|
||||
.u-hidden
|
||||
|
|
100
website/assets/css/_components/_chat.sass
Normal file
100
website/assets/css/_components/_chat.sass
Normal file
|
@ -0,0 +1,100 @@
|
|||
//- 💫 CSS > COMPONENTS > CHAT
|
||||
|
||||
.c-chat
|
||||
@include position(fixed, top, left, 0, 60%)
|
||||
bottom: 0
|
||||
right: 0
|
||||
display: flex
|
||||
flex-flow: column nowrap
|
||||
background: $color-back
|
||||
transition: transform 0.3s cubic-bezier(0.16, 0.22, 0.22, 1.7)
|
||||
box-shadow: -0.25rem 0 1rem 0 rgba($color-front, 0.25)
|
||||
z-index: 100
|
||||
|
||||
@include breakpoint(min, md)
|
||||
left: calc(100% - #{$aside-width} - #{$aside-padding})
|
||||
|
||||
@include breakpoint(max, sm)
|
||||
left: 50%
|
||||
|
||||
@include breakpoint(max, xs)
|
||||
left: 0
|
||||
|
||||
&.is-collapsed:not(.is-loading)
|
||||
transform: translateX(110%)
|
||||
|
||||
&:before
|
||||
@include position(absolute, top, left, 1rem, 2rem)
|
||||
content: attr(data-title)
|
||||
font: bold 1.4rem $font-code
|
||||
text-transform: uppercase
|
||||
color: $color-back
|
||||
|
||||
&:after
|
||||
@include position(absolute, top, left, 0, 100%)
|
||||
content: ""
|
||||
z-index: -1
|
||||
bottom: 0
|
||||
right: -100%
|
||||
background: $color-back
|
||||
|
||||
& > iframe
|
||||
width: 100%
|
||||
flex: 1 1 calc(100% - #{$nav-height})
|
||||
border: 0
|
||||
|
||||
.gitter-chat-embed-loading-wrapper
|
||||
@include position(absolute, top, left, 0, 0)
|
||||
right: 0
|
||||
bottom: 0
|
||||
display: none
|
||||
justify-content: center
|
||||
align-items: center
|
||||
|
||||
.is-loading &
|
||||
display: flex
|
||||
|
||||
.gitter-chat-embed-action-bar,
|
||||
.gitter-chat-embed-action-bar-item
|
||||
display: flex
|
||||
|
||||
.gitter-chat-embed-action-bar
|
||||
align-items: center
|
||||
justify-content: flex-end
|
||||
background: $color-theme
|
||||
padding: 0 1rem 0 2rem
|
||||
flex: 0 0 $nav-height
|
||||
|
||||
.gitter-chat-embed-action-bar-item
|
||||
@include size(40px)
|
||||
padding: 0
|
||||
opacity: 0.75
|
||||
background-position: 50%
|
||||
background-repeat: no-repeat
|
||||
background-size: 22px 22px
|
||||
border: 0
|
||||
cursor: pointer
|
||||
transition: all 0.2s ease
|
||||
|
||||
&:focus,
|
||||
&:hover
|
||||
opacity: 1
|
||||
|
||||
&.gitter-chat-embed-action-bar-item-pop-out
|
||||
background-image: url()
|
||||
margin-right: -4px
|
||||
|
||||
&.gitter-chat-embed-action-bar-item-collapse-chat
|
||||
background-image: url()
|
||||
|
||||
.c-chat__button
|
||||
@include position(fixed, bottom, right, 0, 2rem)
|
||||
padding: 1rem 1.5rem
|
||||
background: $color-front
|
||||
color: $color-back
|
||||
border-top-left-radius: 4px
|
||||
border-top-right-radius: 4px
|
||||
z-index: 20
|
||||
border-color: $color-theme
|
||||
border-style: solid
|
||||
border-width: 1px 1px 0 1px
|
|
@ -24,6 +24,7 @@ $theme: blue !default
|
|||
|
||||
@import _components/asides
|
||||
@import _components/buttons
|
||||
@import _components/chat
|
||||
@import _components/code
|
||||
@import _components/landing
|
||||
@import _components/lists
|
||||
|
|
|
@ -64,5 +64,6 @@
|
|||
<symbol id="matt-signature" viewBox="0 0 500 250">
|
||||
<title>matt-signature</title>
|
||||
<path fill="currentColor" d="M18.6 207c-.3-18.8-.8-37.5-1.4-56.2-.6-18.7-1-37.5-1-56.2v-7.2c0-3.5 0-7 .2-11v-18c.8-2.7 1.8-5 3-6.5 1.6-2 3.6-3 6.4-3 3 0 5.4 1 7.6 2 2.2 2 4 4 5.3 6l36.6 71 1.8 3c1 1 2 3 3 3h1l1 1 1-3 22-76c2-3 3-5 4-8l2-9c1-3 2-6 4-8 1-3 4-5 7-7h2c5 0 8 1 10 4 3 2 4 5 5 9 1 3 2 7 1 12v11l1 7c0 3 0 7 1 12 0 4 1 9 1 14l1 14.2 1 12 .6 6v1l1 7.5 1 11.6 1.4 12 1.4 8 1 4 1.7 5.5 1.7 6c.7 1.7 1 3 1.5 3.6-.5 4-1.5 7-3 9-1 2-4 3-8 3h-6l-3-3c-1-1.4-2-2.3-2-3l-4-14-7.6-58V88c0-3.5-1-7-2-10l-2 1.7-18 74v6c0 2-.2 4-1 6 0 2-1 3.5-3 5-1 1.3-3 2-5 2.2-1 0-2 0-3-1l-3.4-2-3-3c-1-1-1.7-2-2-3l-35-52-5.3-10.6v22c0 10.2.2 20.3.6 30.2.4 10 .6 20 .6 30.2v22c0 2-1 4-3 5.4s-3 3-5 3c-3 0-5 0-7-1-1-1-3-3-4-5zm205-63.2c-1.6 2.7-3.4 6-5.3 9.8l-6.2 12.2c-2 4.3-4 8.6-7 13-2 4.2-5 8.2-8 11.7s-5 6.6-9 9c-3 2.5-6 4-9 4.4-1 0-3-1-4-1l-5-2c-1-1-3-2-4-3s-1-3-1-5c1-18 2-33 4-47s6-27 11-38 12-20 20-27 18-12 29-15l2-1h2c5 0 9 2 11 7s4 12 5 23c1 10 2 24 2 40 1 16 2 36 3 59l1 4v5c0 2.6-1 4.5-2 6s-3 2-5 2c-5 0-8-1.7-10-4s-3-6.6-4-11v-4l-1-9s-1-6.7-1-10l-1-8.5v-1l-.2-6-1-7-.5-8.6-1-1zM218 93.5c-4.7 3.4-9.2 8-13.6 13.7-4.4 5.8-7.5 11.3-9.4 16.8-.8 2.5-1.8 6-2.8 10.4-1 4.4-2 8.8-2.7 13l-2 12-.7 7c.2 0 .4-.2.6-.5l.6-1c10.5-10 18-21 22.2-33 4.6-12 7-25 7.7-39zm72 47c-2.3 0-4.4.6-6.2 1.8-2 1.2-4 1.8-6.6 1.8h-5.4c-.7-1-1.4-1-2.3-2l-2.5-2c-.8 0-1.6-1-2.2-2-.6-1-1-2-1-3 0-2 1-4 3-6 2-1 4.5-3 7.2-4l8.3-3s5-2 6.7-3v-11c0-12-.6-25-1.8-38-1.2-12-1.8-25-1.8-37 0-3 .8-6 2.5-7 1-1 4-1 6-1 3 0 6 1 7 3s2 4 3 7c0 3 1 6 1 9v20l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2.3 9-3.4 14-3.4 3 0 6 1 7 3.5s3 5 3 8c0 2-1 4-3 5l-6 3-46 17-1.5 1s-1 0-1.5 1v8c0 6 0 12 .5 18s1 12.3 2 18.3l3 15c1 5 1.4 10 1.4 15 0 1.4-.6 3.5-1.6 6s-2 4-4.7 4c-5 0-8.7-1.6-11.6-4-3-3-4.3-6.6-4.6-11l-2.2-29-2.7-30h-1zm112 0c-2.4 0-4.5.6-6.3 1.8-2 1.2-4 1.8-6.6 1.8h-5c0-1-1-1-2-2l-2-2c-1 0-1-1-2-2 0-1-1-2-1-3 0-2 1-4 3-6 2-1 5-3 7-4l8-3s5-2 7-3v-11c0-12 0-25-2-38-1-12-1-25-1-37 0-3 1-6 3-7s4-1 7-1c4 0 6 1 8 3s3 4 3 7c1 3 1 6 1 9s0 6 1 8v11l1 18 1 18 
1 12 4-1 6-2 6-2 4-1 14-6c4-2 9-4 14-4 4 0 6 1 8 4s3 5 3 8c0 2-1 4-2 5l-5.3 3-49 13.8-1.5 1s-1 .5-1.5 1V157l1 18.3c0 5 1 10 2 15s1 10 1 15c0 1.5-1 3.6-2 6s-3 4-5 4c-5 0-9-1.5-12-4.2s-5-6-5-11l-3-28.3-3-30.3h-1z"/>
|
||||
</symbol>
|
||||
</defs>
|
||||
</svg>
|
||||
|
|
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 15 KiB |
|
@ -1,32 +1,28 @@
|
|||
<svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||
<defs>
|
||||
<symbol id="icon-github" viewBox="0 0 27 32">
|
||||
<title>github</title>
|
||||
<path class="path1" d="M13.714 2.286q3.732 0 6.884 1.839t4.991 4.991 1.839 6.884q0 4.482-2.616 8.063t-6.759 4.955q-0.482 0.089-0.714-0.125t-0.232-0.536q0-0.054 0.009-1.366t0.009-2.402q0-1.732-0.929-2.536 1.018-0.107 1.83-0.321t1.679-0.696 1.446-1.188 0.946-1.875 0.366-2.688q0-2.125-1.411-3.679 0.661-1.625-0.143-3.643-0.5-0.161-1.446 0.196t-1.643 0.786l-0.679 0.429q-1.661-0.464-3.429-0.464t-3.429 0.464q-0.286-0.196-0.759-0.482t-1.491-0.688-1.518-0.241q-0.804 2.018-0.143 3.643-1.411 1.554-1.411 3.679 0 1.518 0.366 2.679t0.938 1.875 1.438 1.196 1.679 0.696 1.83 0.321q-0.696 0.643-0.875 1.839-0.375 0.179-0.804 0.268t-1.018 0.089-1.17-0.384-0.991-1.116q-0.339-0.571-0.866-0.929t-0.884-0.429l-0.357-0.054q-0.375 0-0.518 0.080t-0.089 0.205 0.161 0.25 0.232 0.214l0.125 0.089q0.393 0.179 0.777 0.679t0.563 0.911l0.179 0.411q0.232 0.679 0.786 1.098t1.196 0.536 1.241 0.125 0.991-0.063l0.411-0.071q0 0.679 0.009 1.58t0.009 0.973q0 0.321-0.232 0.536t-0.714 0.125q-4.143-1.375-6.759-4.955t-2.616-8.063q0-3.732 1.839-6.884t4.991-4.991 6.884-1.839zM5.196 21.982q0.054-0.125-0.125-0.214-0.179-0.054-0.232 0.036-0.054 0.125 0.125 0.214 0.161 0.107 0.232-0.036zM5.75 22.589q0.125-0.089-0.036-0.286-0.179-0.161-0.286-0.054-0.125 0.089 0.036 0.286 0.179 0.179 0.286 0.054zM6.286 23.393q0.161-0.125 0-0.339-0.143-0.232-0.304-0.107-0.161 0.089 0 0.321t0.304 0.125zM7.036 24.143q0.143-0.143-0.071-0.339-0.214-0.214-0.357-0.054-0.161 0.143 0.071 0.339 0.214 0.214 0.357 0.054zM8.054 24.589q0.054-0.196-0.232-0.286-0.268-0.071-0.339 0.125t0.232 0.268q0.268 0.107 0.339-0.107zM9.179 24.679q0-0.232-0.304-0.196-0.286 0-0.286 0.196 0 0.232 0.304 0.196 0.286 0 0.286-0.196zM10.214 24.5q-0.036-0.196-0.321-0.161-0.286 0.054-0.25 0.268t0.321 0.143 0.25-0.25z"></path>
|
||||
<symbol id="github" viewBox="0 0 27 32">
|
||||
<path d="M13.714 2.286q3.732 0 6.884 1.839t4.991 4.991 1.839 6.884q0 4.482-2.616 8.063t-6.759 4.955q-0.482 0.089-0.714-0.125t-0.232-0.536q0-0.054 0.009-1.366t0.009-2.402q0-1.732-0.929-2.536 1.018-0.107 1.83-0.321t1.679-0.696 1.446-1.188 0.946-1.875 0.366-2.688q0-2.125-1.411-3.679 0.661-1.625-0.143-3.643-0.5-0.161-1.446 0.196t-1.643 0.786l-0.679 0.429q-1.661-0.464-3.429-0.464t-3.429 0.464q-0.286-0.196-0.759-0.482t-1.491-0.688-1.518-0.241q-0.804 2.018-0.143 3.643-1.411 1.554-1.411 3.679 0 1.518 0.366 2.679t0.938 1.875 1.438 1.196 1.679 0.696 1.83 0.321q-0.696 0.643-0.875 1.839-0.375 0.179-0.804 0.268t-1.018 0.089-1.17-0.384-0.991-1.116q-0.339-0.571-0.866-0.929t-0.884-0.429l-0.357-0.054q-0.375 0-0.518 0.080t-0.089 0.205 0.161 0.25 0.232 0.214l0.125 0.089q0.393 0.179 0.777 0.679t0.563 0.911l0.179 0.411q0.232 0.679 0.786 1.098t1.196 0.536 1.241 0.125 0.991-0.063l0.411-0.071q0 0.679 0.009 1.58t0.009 0.973q0 0.321-0.232 0.536t-0.714 0.125q-4.143-1.375-6.759-4.955t-2.616-8.063q0-3.732 1.839-6.884t4.991-4.991 6.884-1.839zM5.196 21.982q0.054-0.125-0.125-0.214-0.179-0.054-0.232 0.036-0.054 0.125 0.125 0.214 0.161 0.107 0.232-0.036zM5.75 22.589q0.125-0.089-0.036-0.286-0.179-0.161-0.286-0.054-0.125 0.089 0.036 0.286 0.179 0.179 0.286 0.054zM6.286 23.393q0.161-0.125 0-0.339-0.143-0.232-0.304-0.107-0.161 0.089 0 0.321t0.304 0.125zM7.036 24.143q0.143-0.143-0.071-0.339-0.214-0.214-0.357-0.054-0.161 0.143 0.071 0.339 0.214 0.214 0.357 0.054zM8.054 24.589q0.054-0.196-0.232-0.286-0.268-0.071-0.339 0.125t0.232 0.268q0.268 0.107 0.339-0.107zM9.179 24.679q0-0.232-0.304-0.196-0.286 0-0.286 0.196 0 0.232 0.304 0.196 0.286 0 0.286-0.196zM10.214 24.5q-0.036-0.196-0.321-0.161-0.286 0.054-0.25 0.268t0.321 0.143 0.25-0.25z"></path>
|
||||
</symbol>
|
||||
<symbol id="icon-code" viewBox="0 0 20 20">
|
||||
<title>code</title>
|
||||
<path class="path1" d="M5.719 14.75c-0.236 0-0.474-0.083-0.664-0.252l-5.060-4.498 5.341-4.748c0.412-0.365 1.044-0.33 1.411 0.083s0.33 1.045-0.083 1.412l-3.659 3.253 3.378 3.002c0.413 0.367 0.45 0.999 0.083 1.412-0.197 0.223-0.472 0.336-0.747 0.336zM14.664 14.748l5.341-4.748-5.060-4.498c-0.413-0.367-1.045-0.33-1.411 0.083s-0.33 1.045 0.083 1.412l3.378 3.003-3.659 3.252c-0.413 0.367-0.45 0.999-0.083 1.412 0.197 0.223 0.472 0.336 0.747 0.336 0.236 0 0.474-0.083 0.664-0.252zM9.986 16.165l2-12c0.091-0.545-0.277-1.060-0.822-1.151-0.547-0.092-1.061 0.277-1.15 0.822l-2 12c-0.091 0.545 0.277 1.060 0.822 1.151 0.056 0.009 0.11 0.013 0.165 0.013 0.48 0 0.904-0.347 0.985-0.835z"></path>
|
||||
<symbol id="code" viewBox="0 0 20 20">
|
||||
<path d="M5.719 14.75c-0.236 0-0.474-0.083-0.664-0.252l-5.060-4.498 5.341-4.748c0.412-0.365 1.044-0.33 1.411 0.083s0.33 1.045-0.083 1.412l-3.659 3.253 3.378 3.002c0.413 0.367 0.45 0.999 0.083 1.412-0.197 0.223-0.472 0.336-0.747 0.336zM14.664 14.748l5.341-4.748-5.060-4.498c-0.413-0.367-1.045-0.33-1.411 0.083s-0.33 1.045 0.083 1.412l3.378 3.003-3.659 3.252c-0.413 0.367-0.45 0.999-0.083 1.412 0.197 0.223 0.472 0.336 0.747 0.336 0.236 0 0.474-0.083 0.664-0.252zM9.986 16.165l2-12c0.091-0.545-0.277-1.060-0.822-1.151-0.547-0.092-1.061 0.277-1.15 0.822l-2 12c-0.091 0.545 0.277 1.060 0.822 1.151 0.056 0.009 0.11 0.013 0.165 0.013 0.48 0 0.904-0.347 0.985-0.835z"></path>
|
||||
</symbol>
|
||||
<symbol id="icon-anchor" viewBox="0 0 16 16">
|
||||
<title>anchor</title>
|
||||
<path class="path1" d="M14.779 12.779c-1.471 1.993-4.031 3.245-6.779 3.221-2.748 0.023-5.309-1.229-6.779-3.221l-1.221 1.221v-4h4l-1.1 1.099c0.882 1.46 2.357 2.509 4.1 2.807v-6.047c-1.723-0.446-3-1.997-3-3.858 0-2.209 1.791-4 4-4s4 1.791 4 4c0 1.862-1.277 3.413-3 3.858v6.047c1.742-0.297 3.218-1.347 4.099-2.807l-1.1-1.099h4v4l-1.221-1.221zM10 4c0-1.104-0.895-2-2-2s-2 0.895-2 2c0 1.104 0.895 2 2 2s2-0.896 2-2z"></path>
|
||||
<symbol id="anchor" viewBox="0 0 16 16">
|
||||
<path d="M14.779 12.779c-1.471 1.993-4.031 3.245-6.779 3.221-2.748 0.023-5.309-1.229-6.779-3.221l-1.221 1.221v-4h4l-1.1 1.099c0.882 1.46 2.357 2.509 4.1 2.807v-6.047c-1.723-0.446-3-1.997-3-3.858 0-2.209 1.791-4 4-4s4 1.791 4 4c0 1.862-1.277 3.413-3 3.858v6.047c1.742-0.297 3.218-1.347 4.099-2.807l-1.1-1.099h4v4l-1.221-1.221zM10 4c0-1.104-0.895-2-2-2s-2 0.895-2 2c0 1.104 0.895 2 2 2s2-0.896 2-2z"></path>
|
||||
</symbol>
|
||||
<symbol id="icon-book" viewBox="0 0 24 24">
|
||||
<title>book</title>
|
||||
<path class="path1" d="M18.984 6.984v-1.969h-9.984v1.969h9.984zM15 15v-2.016h-6v2.016h6zM18.984 11.016v-2.016h-9.984v2.016h9.984zM20.016 2.016c1.078 0 1.969 0.891 1.969 1.969v12c0 1.078-0.891 2.016-1.969 2.016h-12c-1.078 0-2.016-0.938-2.016-2.016v-12c0-1.078 0.938-1.969 2.016-1.969h12zM3.984 6v14.016h14.016v1.969h-14.016c-1.078 0-1.969-0.891-1.969-1.969v-14.016h1.969z"></path>
|
||||
<symbol id="book" viewBox="0 0 24 24">
|
||||
<path d="M18.984 6.984v-1.969h-9.984v1.969h9.984zM15 15v-2.016h-6v2.016h6zM18.984 11.016v-2.016h-9.984v2.016h9.984zM20.016 2.016c1.078 0 1.969 0.891 1.969 1.969v12c0 1.078-0.891 2.016-1.969 2.016h-12c-1.078 0-2.016-0.938-2.016-2.016v-12c0-1.078 0.938-1.969 2.016-1.969h12zM3.984 6v14.016h14.016v1.969h-14.016c-1.078 0-1.969-0.891-1.969-1.969v-14.016h1.969z"></path>
|
||||
</symbol>
|
||||
<symbol id="icon-pro" viewBox="0 0 20 20">
|
||||
<title>pro</title>
|
||||
<path class="path1" d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-4v4h-2v-4h-4v-2h4v-4h2v4h4v2z"></path>
|
||||
<symbol id="pro" viewBox="0 0 20 20">
|
||||
<path d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-4v4h-2v-4h-4v-2h4v-4h2v4h4v2z"></path>
|
||||
</symbol>
|
||||
<symbol id="icon-con" viewBox="0 0 20 20">
|
||||
<title>con</title>
|
||||
<path class="path1" d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-10v-2h10v2z"></path>
|
||||
<symbol id="con" viewBox="0 0 20 20">
|
||||
<path d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-10v-2h10v2z"></path>
|
||||
</symbol>
|
||||
<symbol id="icon-neutral" viewBox="0 0 20 20">
|
||||
<title>neutral</title>
|
||||
<path class="path1" d="M9.999 0.8c-5.081 0-9.199 4.119-9.199 9.201 0 5.080 4.118 9.199 9.199 9.199s9.2-4.119 9.2-9.199c0-5.082-4.119-9.201-9.2-9.201zM10 13.001c-1.657 0-3-1.344-3-3s1.343-3 3-3c1.656 0 3 1.344 3 3s-1.344 3-3 3z"></path>
|
||||
<symbol id="neutral" viewBox="0 0 20 20">
|
||||
<path d="M9.999 0.8c-5.081 0-9.199 4.119-9.199 9.201 0 5.080 4.118 9.199 9.199 9.199s9.2-4.119 9.2-9.199c0-5.082-4.119-9.201-9.2-9.201zM10 13.001c-1.657 0-3-1.344-3-3s1.343-3 3-3c1.656 0 3 1.344 3 3s-1.344 3-3 3z"></path>
|
||||
</symbol>
|
||||
<symbol id="chat" viewBox="0 0 24 24">
|
||||
<path d="M18 8.016v-2.016h-12v2.016h12zM18 11.016v-2.016h-12v2.016h12zM18 14.016v-2.016h-12v2.016h12zM21.984 3.984v18l-3.984-3.984h-14.016c-1.078 0-1.969-0.938-1.969-2.016v-12c0-1.078 0.891-1.969 1.969-1.969h16.031c1.078 0 1.969 0.891 1.969 1.969z"></path>
|
||||
</symbol>
|
||||
</defs>
|
||||
</svg>
|
||||
|
|
Before Width: | Height: | Size: 4.7 KiB After Width: | Height: | Size: 4.6 KiB |
|
@ -23,7 +23,7 @@ p
|
|||
|
||||
+row
|
||||
+cell Multi-language support
|
||||
each icon in [ "con", "pro", "pro", "pro" ]
|
||||
each icon in [ "neutral", "pro", "pro", "pro" ]
|
||||
+cell.u-text-center #[+procon(icon)]
|
||||
|
||||
+row
|
||||
|
|
|
@ -2,8 +2,6 @@
|
|||
|
||||
include ../_includes/_mixins
|
||||
|
||||
p=lorem_short
|
||||
|
||||
+aside("Help us improve the docs")
|
||||
| Did you spot a mistake or come across explanations that
|
||||
| are unclear? You can find a "Suggest edits" button at the
|
||||
|
|
|
@ -57,7 +57,7 @@ p
|
|||
doc.ents = [Span(0, 1, label='GPE')]
|
||||
assert doc[0].ent_type_ == 'GPE'
|
||||
doc.ents = []
|
||||
doc.ents = [(u'LondonCity', 0, 1, u'GPE')]
|
||||
doc.ents = [(u'LondonCity', u'GPE', 0, 1)]
|
||||
|
||||
p
|
||||
| The value you assign should be a sequence, the values of which
|
||||
|
|
|
@ -30,6 +30,13 @@ p Many of the associated tools and resources that we're developing alongside spa
|
|||
+cell
|
||||
| REST microservices for spaCy demos and visualisers.
|
||||
|
||||
+row
|
||||
+cell
|
||||
+src(gh("spacy-notebooks")) spaCy Notebooks
|
||||
|
||||
+cell
|
||||
| Jupyter notebooks for spaCy examples and tutorials.
|
||||
|
||||
+h(2, "libraries") Libraries and projects
|
||||
+table(["Name", "Description"])
|
||||
+row
|
||||
|
|
|
@ -141,7 +141,7 @@ p
|
|||
span.merge(label=label, tag='NNP' if label else span.root.tag_)
|
||||
|
||||
matcher.add_entity('GoogleNow', on_match=merge_phrases)
|
||||
matcher.add_pattern('GoogleNow', {ORTH: 'Google'}, {ORTH: 'Now'}])
|
||||
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
||||
doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
|
||||
matcher(doc)
|
||||
print([w.text for w in doc])
|
||||
|
|
Loading…
Reference in New Issue
Block a user