Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)

Commit 3eb6a929f3: Merge branch 'master' of ssh://github.com/explosion/spaCy

@@ -33,6 +33,7 @@ We use the following system to tag our issues:
 | [`install`](https://github.com/explosion/spaCy/labels/install) | Installation problems |
 | [`performance`](https://github.com/explosion/spaCy/labels/performance) | Accuracy, speed and memory use problems |
 | [`tests`](https://github.com/explosion/spaCy/labels/tests) | Missing or incorrect [tests](spacy/tests) |
+| [`english`](https://github.com/explosion/spaCy/labels/english), [`german`](https://github.com/explosion/spaCy/labels/german) | Issues related to the specific languages, models and data |
 | [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems |
 | [`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers |
 | [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before |

@@ -3,8 +3,10 @@ spaCy: Industrial-strength NLP
 
 spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on the very latest research, but it isn't researchware.
-It was designed from day 1 to be used in real products. It's commercial
-open-source software, released under the MIT license.
+It was designed from day one to be used in real products. spaCy currently supports
+English and German, as well as tokenization for Chinese, Spanish, Italian, French,
+Portuguese, Dutch, Swedish and Hungarian. It's commercial open-source software,
+released under the MIT license.
 
 💫 **Version 1.5 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
 
@@ -24,7 +26,7 @@ open-source software, released under the MIT license.
     :target: https://pypi.python.org/pypi/spacy
     :alt: pypi Version
 
-.. image:: https://badges.gitter.im/spaCy-users.png
+.. image:: https://badges.gitter.im/explosion.png
    :target: https://gitter.im/explosion/spaCy
    :alt: spaCy on Gitter
 

@@ -71,6 +71,8 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc):
     features = get_templates('basic')
 
     model_dir = pathlib.Path(model_dir)
+    if not (model_dir / 'deps').exists():
+        (model_dir / 'deps').mkdir()
     with (model_dir / 'deps' / 'config.json').open('w') as file_:
         json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)
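
The added guard creates the `deps` output directory only when it is missing before the config is written. A minimal stand-alone sketch of the same pattern; the path below is a made-up example, and `exist_ok` assumes Python 3.5+:

    import json
    import pathlib

    # Hypothetical example path; the real script receives model_dir as an argument.
    model_dir = pathlib.Path('/tmp/parser_model')

    # Same effect in one call: parents/exist_ok replace the explicit exists() check.
    (model_dir / 'deps').mkdir(parents=True, exist_ok=True)

    with (model_dir / 'deps' / 'config.json').open('w') as file_:
        json.dump({'pseudoprojective': True}, file_)
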
setup.py (3 changed lines)
@@ -47,8 +47,7 @@ PACKAGES = [
     'spacy.tests.tokenizer',
     'spacy.tests.tokens',
     'spacy.tests.vectors',
-    'spacy.tests.vocab',
-    'spacy.tests.website']
+    'spacy.tests.vocab']
 
 
 MOD_NAMES = [

@@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 
 
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
 
 
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 
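The reordering above builds the language's exception dict first and then merges in the shared abbreviation list before the emoticons. A rough sketch of that merge pattern using plain dicts; the helper below is illustrative only, not spaCy's actual `strings_to_exc`/`update_exc` implementation:

    # Illustrative only: model tokenizer exceptions as a dict of string -> token specs.
    def strings_to_exc_sketch(strings):
        return {s: [{"ORTH": s}] for s in strings}

    TOKENIZER_EXCEPTIONS = {}                    # language-specific exceptions would go here
    ORTH_ONLY = ["A.C.", "Abs.", "e.g."]         # sample entries from the German list
    ABBREVIATIONS = ["a.", "b.", "ä."]           # shared single-letter abbreviations

    TOKENIZER_EXCEPTIONS.update(strings_to_exc_sketch(ORTH_ONLY))
    TOKENIZER_EXCEPTIONS.update(strings_to_exc_sketch(ABBREVIATIONS))
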
@@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = {
 
 
 ORTH_ONLY = [
-    "'",
-    "\\\")",
-    "<space>",
-    "a.",
-    "ä.",
     "A.C.",
     "a.D.",
     "A.D.",
@@ -530,24 +525,20 @@ ORTH_ONLY = [
     "Abs.",
     "adv.",
     "al.",
-    "b.",
     "B.A.",
     "B.Sc.",
     "betr.",
     "biol.",
     "Biol.",
-    "c.",
     "ca.",
     "Chr.",
     "Cie.",
     "co.",
     "Co.",
-    "d.",
     "D.C.",
     "Dipl.-Ing.",
     "Dipl.",
     "Dr.",
-    "e.",
     "e.g.",
     "e.V.",
     "ehem.",
@@ -555,79 +546,57 @@ ORTH_ONLY = [
     "erm.",
     "etc.",
     "ev.",
-    "f.",
-    "g.",
     "G.m.b.H.",
     "geb.",
     "Gebr.",
     "gem.",
-    "h.",
     "h.c.",
     "Hg.",
     "hrsg.",
     "Hrsg.",
-    "i.",
     "i.A.",
     "i.e.",
     "i.G.",
     "i.Tr.",
     "i.V.",
     "Ing.",
-    "j.",
     "jr.",
     "Jr.",
     "jun.",
     "jur.",
-    "k.",
     "K.O.",
-    "l.",
     "L.A.",
     "lat.",
-    "m.",
     "M.A.",
     "m.E.",
     "m.M.",
     "M.Sc.",
     "Mr.",
-    "n.",
     "N.Y.",
     "N.Y.C.",
     "nat.",
     "ö."
-    "o.",
     "o.a.",
     "o.ä.",
     "o.g.",
     "o.k.",
     "O.K.",
-    "p.",
     "p.a.",
     "p.s.",
     "P.S.",
     "pers.",
     "phil.",
-    "q.",
     "q.e.d.",
-    "r.",
     "R.I.P.",
     "rer.",
-    "s.",
     "sen.",
     "St.",
     "std.",
-    "t.",
-    "u.",
-    "ü.",
     "u.a.",
     "U.S.",
     "U.S.A.",
     "U.S.S.",
-    "v.",
     "Vol.",
     "vs.",
-    "w.",
-    "wiss.",
-    "x.",
-    "y.",
-    "z."
+    "wiss."
 ]

@@ -37,14 +37,16 @@ def get_time_exc(hours):
     return exc
 
 
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
 
 
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"]

(File diff suppressed because it is too large.)

@@ -40,11 +40,14 @@ def get_time_exc(hours):
     return exc
 
 
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 STOP_WORDS = set(STOP_WORDS)
 
 
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

@@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = {
 
 
 ORTH_ONLY = [
-    "a.",
     "a.C.",
     "a.J.C.",
     "apdo.",
     "Av.",
     "Avda.",
-    "b.",
-    "c.",
     "Cía.",
-    "d.",
-    "e.",
     "etc.",
-    "f.",
-    "g.",
     "Gob.",
     "Gral.",
-    "h.",
-    "i.",
     "Ing.",
-    "j.",
     "J.C.",
-    "k.",
-    "l.",
     "Lic.",
-    "m.",
     "m.n.",
-    "n.",
     "no.",
     "núm.",
-    "o.",
-    "p.",
     "P.D.",
     "Prof.",
     "Profa.",
-    "q.",
     "q.e.p.d."
-    "r.",
-    "s.",
     "S.A.",
     "S.L.",
     "s.s.s.",
     "Sr.",
     "Sra.",
-    "Srta.",
-    "t.",
-    "u.",
-    "v.",
-    "w.",
-    "x.",
-    "y.",
-    "z."
+    "Srta."
 ]

@@ -2,13 +2,16 @@
 from __future__ import unicode_literals
 
 from .. import language_data as base
-from ..language_data import strings_to_exc
+from ..language_data import strings_to_exc, update_exc
 
 from .stop_words import STOP_WORDS
 
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
 
 
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

@@ -4,21 +4,25 @@ from __future__ import unicode_literals
 import six
 
 from spacy.language_data import strings_to_exc, update_exc
-from .punctuations import *
+from .punctuation import *
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import ABBREVIATIONS
 from .tokenizer_exceptions import OTHER_EXC
 from .. import language_data as base
 
 
 STOP_WORDS = set(STOP_WORDS)
 
 
 TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES
-TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
-TOKENIZER_INFIXES = TOKENIZER_INFIXES
-
-# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
 
 
+TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
+TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES
+TOKENIZER_INFIXES = TOKENIZER_INFIXES
 
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

spacy/hu/punctuation.py (new file, 25 lines)
@@ -0,0 +1,25 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES
+
+
+TOKENIZER_SUFFIXES = [
+    r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
+]
+
+TOKENIZER_INFIXES = [
+    r'(?<=[0-9])-(?=[0-9])',
+    r'(?<=[0-9])[+\-\*/^](?=[0-9])',
+    r'(?<=[{a}])--(?=[{a}])',
+    r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+    r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA),
+    r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
+]
+
+
+TOKENIZER_INFIXES += LIST_ELLIPSES
+
+
+__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
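
The suffix rule in the new file splits off the Hungarian question particle "-e" when it follows a lower-case letter or a closing bracket. A minimal `re` sketch of that rule in isolation; the character class is shortened here, while the real one is built from `ALPHA_LOWER`:

    import re

    # Shortened stand-in for ALPHA_LOWER; the real class covers many more letters.
    suffix_e = re.compile(r'(?<=[a-záéíóöőúüű)])-e$')

    print(bool(suffix_e.search('van-e')))   # True -> "-e" would be split off as a suffix
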
@@ -1,89 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
-TOKENIZER_PREFIXES = r'''
-+
-'''.strip().split('\n')
-
-TOKENIZER_SUFFIXES = r'''
-,
-\"
-\)
-\]
-\}
-\*
-\!
-\?
-\$
->
-:
-;
-'
-    [... roughly 50 more raw-string suffix patterns: quotes, dashes, the
-     Hungarian "(?<=[a-züóőúéáűí)])-e" rule and the (?<=[0-9]) unit suffixes
-     (km², m², cm, mm, µm, kg, m/s, km/h, hPa, mbar, T, G, M, K, kb, ...) ...]
-'''.strip().split('\n')
-
-TOKENIZER_INFIXES = r'''
-…
-\.\.+
-(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ])
-(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
-(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
-(?<=[0-9])[+\-\*/^](?=[0-9])
-(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
-'''.strip().split('\n')
-
-__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

(Hungarian abbreviation list; the unchanged surrounding entries are omitted from the hunks below.)
@@ -111,7 +111,6 @@ Vcs.
-a.
@@ -126,7 +125,6 @@ ang.
-b.
@@ -141,7 +139,6 @@ br.
-c.
@@ -155,7 +152,6 @@ csc.
-d.
@@ -170,7 +166,6 @@ dolg.
-e.
@@ -186,7 +181,6 @@ etc.
-f.
@@ -213,7 +207,6 @@ főig.
-g.
@@ -225,7 +218,6 @@ gy.
-h.
@@ -266,7 +258,6 @@ isk.
-j.
@@ -278,7 +269,6 @@ jr.
-k.
@@ -313,7 +303,6 @@ közl.
-l.
@@ -324,7 +313,6 @@ lt.
-m.
@@ -359,7 +347,6 @@ műh.
-n.
@@ -372,7 +359,6 @@ ny.
-o.
@@ -381,7 +367,6 @@ orsz.
-p.
@@ -404,8 +389,6 @@ pság.
-q.
-r.
@@ -420,7 +403,6 @@ rkt.
-s.
@@ -450,7 +432,6 @@ szt.
-t.
@@ -476,13 +457,11 @@ tvr.
-u.
-v.
@@ -501,9 +480,6 @@ vv.
-w.
-y.
-z.
@@ -520,7 +496,6 @@ zs.
-ö.
@@ -528,7 +503,6 @@
-ü.
@@ -544,6 +518,5 @@
 """.strip().split()
 
 OTHER_EXC = """
-''
 -e
 """.strip().split()

@@ -1,8 +1,6 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from os import path
-
 from ..language import Language
 from ..attrs import LANG
 

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
 
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
 
 
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

@@ -1,3 +1,4 @@
+from .abbreviations import *
 from .emoticons import *
 from .punctuation import *
 from .tag_map import *

spacy/language_data/abbreviations.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+ABBREVIATIONS = [
+    "'",
+    "\\\")",
+    "<space>",
+    "''",
+    "C++",
+    "a.",
+    "b.",
+    "c.",
+    "d.",
+    "e.",
+    "f.",
+    "g.",
+    "h.",
+    "i.",
+    "j.",
+    "k.",
+    "l.",
+    "m.",
+    "n.",
+    "o.",
+    "p.",
+    "q.",
+    "r.",
+    "s.",
+    "t.",
+    "u.",
+    "v.",
+    "w.",
+    "x.",
+    "y.",
+    "z.",
+    "ä.",
+    "ö.",
+    "ü."
+]
+
+
+__all__ = [ "ABBREVIATIONS" ]
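
The shared list above covers every single letter followed by a period, plus a few special strings, so the per-language lists no longer need them. For illustration, the alphabetical part can be reconstructed programmatically (this snippet is an aside, not part of the module):

    import string

    # Rebuild the a.-z. portion of the shared list and show a few extras.
    single_letter_abbrevs = ["%s." % letter for letter in string.ascii_lowercase]
    extra = ["'", "\\\")", "<space>", "''", "C++", "ä.", "ö.", "ü."]
    print(single_letter_abbrevs[:3] + extra[-3:])   # ['a.', 'b.', 'c.', 'ä.', 'ö.', 'ü.']
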
@@ -13,6 +13,7 @@ EMOTICONS = set("""
 (-:
 =)
 (=
+")
 :]
 :-]
 [:

@@ -1,133 +1,115 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-TOKENIZER_PREFIXES = r'''
-,
-"
-(
-[
-{
-*
-<
->
-$
-£
-¡
-¿
-„
-“
-'
-``
-`
-#
-‘
-....
-...
-…
-‚
-»
-§
-US$
-C$
-A$
-a-
-'''.strip().split('\n')
-
-TOKENIZER_SUFFIXES = r'''
-,
-\"
-\)
-\]
-\}
-\*
-\!
-\?
-%
-\$
-    [... roughly 65 more raw-string suffix patterns (quotes, 's/'S/’s/’S, °, €,
-     ellipses, sentence-final period rules and the (?<=[0-9]) unit suffixes) ...]
-'''.strip().split('\n')
-
-TOKENIZER_INFIXES = r'''
-…
-\.\.\.+
-(?<=[a-z])\.(?=[A-Z])
-(?<=[a-zA-Z])-(?=[a-zA-z])
-(?<=[a-zA-Z])--(?=[a-zA-z])
-(?<=[0-9])-(?=[0-9])
-(?<=[A-Za-z]),(?=[A-Za-z])
-(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
-(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
-(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
-(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])
-'''.strip().split('\n')
+import re
+
+_ALPHA_LOWER = """
+a ä à á â ǎ æ ã å ā ă ą b c ç ć č ĉ ċ c̄ d ð ď e é è ê ë ė ȅ ȩ ẽ ę f g ĝ ğ h i ı
+î ï í ī ì ȉ ǐ į ĩ j k ķ l ł ļ m n ñ ń ň ņ o ö ó ò ő ô õ œ ø ō ő ǒ ơ p q r ř ŗ s
+ß ś š ş ŝ t ť u ú û ù ú ū ű ǔ ů ų ư v w ŵ x y ÿ ý ỳ ŷ ỹ z ź ž ż þ
+"""
+
+_ALPHA_UPPER = """
+A Ä À Á Â Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Ė Ȅ Ȩ Ẽ Ę F G Ĝ Ğ H I İ
+Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S
+Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ỳ Ŷ Ỹ Z Ź Ž Ż Þ
+"""
+
+_UNITS = """
+km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg
+µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb
+TB T G M K
+"""
+
+_CURRENCY = r"""
+\$ £ € ¥ ฿ US\$ C\$ A\$
+"""
+
+_QUOTES = r"""
+' '' " ” “ `` ` ‘ ´ ‚ , „ » «
+"""
+
+_PUNCT = r"""
+… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &
+"""
+
+_HYPHENS = r"""
+- – — -- ---
+"""
+
+LIST_ELLIPSES = [
+    r'\.\.+',
+    "…"
+]
+
+LIST_CURRENCY = list(_CURRENCY.strip().split())
+LIST_QUOTES = list(_QUOTES.strip().split())
+LIST_PUNCT = list(_PUNCT.strip().split())
+LIST_HYPHENS = list(_HYPHENS.strip().split())
+
+ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '')
+ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '')
+ALPHA = ALPHA_LOWER + ALPHA_UPPER
+
+QUOTES = _QUOTES.strip().replace(' ', '|')
+CURRENCY = _CURRENCY.strip().replace(' ', '|')
+UNITS = _UNITS.strip().replace(' ', '|')
+HYPHENS = _HYPHENS.strip().replace(' ', '|')
+
+
+# Prefixes
+
+TOKENIZER_PREFIXES = (
+    ['§', '%', r'\+'] +
+    LIST_PUNCT +
+    LIST_ELLIPSES +
+    LIST_QUOTES +
+    LIST_CURRENCY
+)
+
+
+# Suffixes
+
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT +
+    LIST_ELLIPSES +
+    LIST_QUOTES +
+    [
+        r'(?<=[0-9])\+',
+        r'(?<=°[FfCcKk])\.',
+        r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
+        r'(?<=[0-9])(?:{u})'.format(u=UNITS),
+        r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
+        r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
+        "'s", "'S", "’s", "’S"
+    ]
+)
+
+
+# Infixes
+
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES +
+    [
+        r'(?<=[0-9])[+\-\*/^](?=[0-9])',
+        r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+        r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
+        r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
+    ]
+)
+
+
 __all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
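
The rewrite above replaces the old newline-separated pattern strings with plain Python lists that downstream code joins into regular expressions; the new English punctuation tests below use `compile_prefix_regex` from `spacy.util` for exactly this. A simplified sketch of the idea, not the library's own helper:

    import re

    # A few of the prefix patterns defined above.
    prefixes = ['§', '%', r'\+', r'\.\.+', '…', r'\(', r'\[', '"']

    # OR the alternatives together and anchor them at the start of the string.
    prefix_search = re.compile('^(?:' + '|'.join(prefixes) + ')').search

    match = prefix_search('(Hello')
    print(match.group(0))   # '('
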
@@ -20,5 +20,6 @@ TAG_MAP = {
     "X": {POS: X},
     "CONJ": {POS: CONJ},
     "ADJ": {POS: ADJ},
-    "VERB": {POS: VERB}
+    "VERB": {POS: VERB},
+    "PART": {POS: PART}
 }
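
With the added entry, the coarse-grained PART tag now resolves to a part-of-speech value like the others. A toy illustration with string stand-ins (in spaCy, `POS` and the tag values are attribute symbols, not strings):

    # Toy stand-in for a tag map lookup.
    POS = "pos"
    TAG_MAP = {
        "ADJ": {POS: "ADJ"},
        "VERB": {POS: "VERB"},
        "PART": {POS: "PART"},
    }
    print(TAG_MAP["PART"][POS])   # PART
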
@@ -1,8 +1,6 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from os import path
-
 from ..language import Language
 from ..attrs import LANG
 from .language_data import *

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
 
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
 
 
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

@@ -1,8 +1,6 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from os import path
-
 from ..language import Language
 from ..attrs import LANG
 

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
 
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
 
 
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

@@ -1,8 +1,6 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from os import path
-
 from ..language import Language
 from ..attrs import LANG
 from .language_data import *

@@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS
 
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
 
 
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

spacy/tests/de/conftest.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+from ...de import German
+
+
+@pytest.fixture
+def de_tokenizer():
+    return German.Defaults.create_tokenizer()

spacy/tests/de/tokenizer/__init__.py (new empty file)

spacy/tests/de/tokenizer/test_exceptions.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+# coding: utf-8
+"""Test that tokenizer exceptions and emoticons are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
+def test_tokenizer_splits_contractions(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
+def test_tokenizer_handles_abbr(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 1
+
+
+def test_tokenizer_handles_exc_in_text(de_tokenizer):
+    text = "Ich bin z.Zt. im Urlaub."
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 6
+    assert tokens[2].text == "z.Zt."
+    assert tokens[2].lemma_ == "zur Zeit"

spacy/tests/de/tokenizer/test_prefix_suffix_infix.py (new file, 116 lines)
@@ -0,0 +1,116 @@
+# coding: utf-8
+"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["(unter)"])
+def test_tokenizer_splits_no_special(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["unter'm"])
+def test_tokenizer_splits_no_punct(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["(unter'm"])
+def test_tokenizer_splits_prefix_punct(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["unter'm)"])
+def test_tokenizer_splits_suffix_punct(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["(unter'm)"])
+def test_tokenizer_splits_even_wrap(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 4
+
+
+@pytest.mark.parametrize('text', ["(unter'm?)"])
+def test_tokenizer_splits_uneven_wrap(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 5
+
+
+@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
+def test_tokenizer_splits_prefix_interact(de_tokenizer, text, length):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text', ["z.B.)"])
+def test_tokenizer_splits_suffix_interact(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["(z.B.)"])
+def test_tokenizer_splits_even_wrap_interact(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["(z.B.?)"])
+def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 4
+
+
+@pytest.mark.parametrize('text', ["blau-rot"])
+def test_tokenizer_splits_hyphens(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
+def test_tokenizer_splits_numeric_range(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"])
+def test_tokenizer_splits_period_infix(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"])
+def test_tokenizer_splits_comma_infix(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[0].text == text.split(",")[0]
+    assert tokens[1].text == ","
+    assert tokens[2].text == text.split(",")[1]
+
+
+@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"])
+def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
+    tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
+    assert len(tokens) == 12
+    assert tokens[0].text == "Viele"
+    assert tokens[1].text == "Regeln"
+    assert tokens[2].text == "--"
+    assert tokens[3].text == "wie"
+    assert tokens[4].text == "die"
+    assert tokens[5].text == "Bindestrich"
+    assert tokens[6].text == "-"
+    assert tokens[7].text == "Regeln"
+    assert tokens[8].text == "--"
+    assert tokens[9].text == "sind"
+    assert tokens[10].text == "kompliziert"

spacy/tests/de/tokenizer/test_text.py
Normal file
45
spacy/tests/de/tokenizer/test_text.py
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
# coding: utf-8
|
||||||
|
"""Test that longer and mixed texts are tokenized correctly."""
|
||||||
|
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_handles_long_text(de_tokenizer):
|
||||||
|
text = """Die Verwandlung
|
||||||
|
|
||||||
|
Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in
|
||||||
|
seinem Bett zu einem ungeheueren Ungeziefer verwandelt.
|
||||||
|
|
||||||
|
Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig
|
||||||
|
hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten
|
||||||
|
Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit,
|
||||||
|
kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen
|
||||||
|
Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
|
||||||
|
|
||||||
|
»Was ist mit mir geschehen?«, dachte er."""
|
||||||
|
|
||||||
|
tokens = de_tokenizer(text)
|
||||||
|
assert len(tokens) == 109
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,length', [
|
||||||
|
("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
|
||||||
|
("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
|
||||||
|
("Kraftfahrzeug-Haftpflichtversicherung", 3),
|
||||||
|
("Vakuum-Mittelfrequenz-Induktionsofen", 5)
|
||||||
|
])
|
||||||
|
def test_tokenizer_handles_long_words(de_tokenizer, text, length):
|
||||||
|
tokens = de_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,length', [
|
||||||
|
("»Was ist mit mir geschehen?«, dachte er.", 12),
|
||||||
|
("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15)
|
||||||
|
])
|
||||||
|
def test_tokenizer_handles_examples(de_tokenizer, text, length):
|
||||||
|
tokens = de_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
spacy/tests/en/__init__.py (new empty file)

spacy/tests/en/conftest.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+from ...en import English
+
+
+@pytest.fixture
+def en_tokenizer():
+    return English.Defaults.create_tokenizer()

spacy/tests/en/tokenizer/__init__.py (new empty file)

spacy/tests/en/tokenizer/test_contractions.py (new file, 87 lines)
@@ -0,0 +1,87 @@
+# coding: utf-8
+"""Test that tokens are created correctly for contractions."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_tokenizer_handles_basic_contraction(en_tokenizer):
+    text = "don't giggle"
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[1].text == "n't"
+    text = "i said don't!"
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 5
+    assert tokens[4].text == "!"
+
+
+@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
+def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
+def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
+    tokens = en_tokenizer(text_poss)
+    assert len(tokens) == 2
+    assert tokens[0].text == text
+    assert tokens[1].text == "'s"
+
+
+@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
+def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 2
+    assert tokens[0].text == text.split("'")[0]
+    assert tokens[1].text == "'"
+
+
+@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
+def test_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].text == text
+
+
+@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
+def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 2
+    assert tokens[0].text == text.split("'")[0]
+    assert tokens[1].text == "'ll"
+    assert tokens[1].lemma_ == "will"
+
+
+@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
+def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
+    tokens_lower = en_tokenizer(text_lower)
+    tokens_title = en_tokenizer(text_title)
+    assert tokens_title[0].text == tokens_lower[0].text.title()
+    assert tokens_lower[0].text == tokens_title[0].text.lower()
+    assert tokens_lower[1].text == tokens_title[1].text
+
+
+@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
+@pytest.mark.parametrize('contraction', ["'ll", "'d"])
+def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
+    tokens = en_tokenizer(pron + contraction)
+    assert tokens[0].text == pron
+    assert tokens[1].text == contraction
+
+
+@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
+def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
+    tokens = en_tokenizer(exc)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
+def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
+    tokens = en_tokenizer(wo_punct)
+    assert len(tokens) == 2
+    tokens = en_tokenizer(w_punct)
+    assert len(tokens) == 3
spacy/tests/en/tokenizer/test_exceptions.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+# coding: utf-8
+"""Test that tokenizer exceptions are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
+def test_tokenizer_handles_abbr(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 1
+
+
+def test_tokenizer_handles_exc_in_text(en_tokenizer):
+    text = "It's mediocre i.e. bad."
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 6
+    assert tokens[3].text == "i.e."

@@ -1,12 +1,14 @@
+# coding: utf-8
 """Test that token.idx correctly computes index into the original string."""
 
 
 from __future__ import unicode_literals
 
 import pytest
 
 
 def test_simple_punct(en_tokenizer):
-    text = 'to walk, do foo'
+    text = "to walk, do foo"
     tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert tokens[1].idx == 3
@@ -16,7 +18,7 @@ def test_simple_punct(en_tokenizer):
 
 
 def test_complex_punct(en_tokenizer):
-    text = 'Tom (D., Ill.)!'
+    text = "Tom (D., Ill.)!"
     tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert len(tokens[0]) == 3

spacy/tests/en/tokenizer/test_prefix_suffix_infix.py (new file, 136 lines)
@@ -0,0 +1,136 @@
+# coding: utf-8
+"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["(can)"])
+def test_tokenizer_splits_no_special(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["can't"])
+def test_tokenizer_splits_no_punct(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["(can't"])
+def test_tokenizer_splits_prefix_punct(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["can't)"])
+def test_tokenizer_splits_suffix_punct(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["(can't)"])
+def test_tokenizer_splits_even_wrap(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 4
+
+
+@pytest.mark.parametrize('text', ["(can't?)"])
+def test_tokenizer_splits_uneven_wrap(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 5
+
+
+@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
+def test_tokenizer_splits_prefix_interact(en_tokenizer, text, length):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text', ["U.S.)"])
+def test_tokenizer_splits_suffix_interact(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["(U.S.)"])
+def test_tokenizer_splits_even_wrap_interact(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["(U.S.?)"])
+def test_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 4
+
+
+@pytest.mark.parametrize('text', ["best-known"])
+def test_tokenizer_splits_hyphens(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
+def test_tokenizer_splits_numeric_range(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["best.Known", "Hello.World"])
+def test_tokenizer_splits_period_infix(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["Hello,world", "one,two"])
+def test_tokenizer_splits_comma_infix(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[0].text == text.split(",")[0]
+    assert tokens[1].text == ","
+    assert tokens[2].text == text.split(",")[1]
+
+
+@pytest.mark.parametrize('text', ["best...Known", "best...known"])
+def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+
+
+def test_tokenizer_splits_double_hyphen_infix(en_tokenizer):
+    tokens = en_tokenizer("No decent--let alone well-bred--people.")
+    assert tokens[0].text == "No"
+    assert tokens[1].text == "decent"
+    assert tokens[2].text == "--"
+    assert tokens[3].text == "let"
+    assert tokens[4].text == "alone"
+    assert tokens[5].text == "well"
+    assert tokens[6].text == "-"
+    assert tokens[7].text == "bred"
+    assert tokens[8].text == "--"
+    assert tokens[9].text == "people"
+
+
+@pytest.mark.xfail
+def test_tokenizer_splits_period_abbr(en_tokenizer):
+    text = "Today is Tuesday.Mr."
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 5
+    assert tokens[0].text == "Today"
+    assert tokens[1].text == "is"
+    assert tokens[2].text == "Tuesday"
+    assert tokens[3].text == "."
+    assert tokens[4].text == "Mr."
+
+
+@pytest.mark.xfail
+def test_tokenizer_splits_em_dash_infix(en_tokenizer):
+    # Re Issue #225
+    tokens = en_tokenizer("""Will this road take me to Puddleton?\u2014No, """
+                          """you'll have to walk there.\u2014Ariel.""")
+    assert tokens[6].text == "Puddleton"
+    assert tokens[7].text == "?"
+    assert tokens[8].text == "\u2014"

132
spacy/tests/en/tokenizer/test_punct.py
Normal file
132
spacy/tests/en/tokenizer/test_punct.py
Normal file
|
@ -0,0 +1,132 @@
|
||||||
|
# coding: utf-8
"""Test that open, closed and paired punctuation is split off correctly."""


from __future__ import unicode_literals

import pytest

from ....util import compile_prefix_regex
from ....language_data import TOKENIZER_PREFIXES


en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search

PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


@pytest.mark.parametrize('text', ["(", "((", "<"])
def test_tokenizer_handles_only_punct(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == len(text)


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
    tokens = en_tokenizer(punct + text)
    assert len(tokens) == 2
    assert tokens[0].text == punct
    assert tokens[1].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
    tokens = en_tokenizer(text + punct)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
    tokens = en_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3
    assert tokens[0].text == punct
    assert tokens[1].text == punct_add
    assert tokens[2].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
    tokens = en_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3
    assert tokens[0].text == text
    assert tokens[1].text == punct
    assert tokens[2].text == punct_add


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
    tokens = en_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4
    assert tokens[0].text == punct
    assert tokens[3].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
    tokens = en_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize('text', ["'The"])
def test_tokenizer_splits_open_appostrophe(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"


@pytest.mark.parametrize('text', ["Hello''"])
def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    tokens_punct = en_tokenizer("''")
    assert len(tokens_punct) == 1


@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text):
    tokens = en_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open
    assert tokens[1].text == text
    assert tokens[2].text == punct_close


@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")])
@pytest.mark.parametrize('text', ["Hello"])
def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text):
    tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open_add
    assert tokens[1].text == punct_open
    assert tokens[2].text == text
    assert tokens[3].text == punct_close
    assert tokens[4].text == punct_close_add


@pytest.mark.parametrize('text,punct', [("(can't", "(")])
def test_tokenizer_splits_pre_punct_regex(text, punct):
    match = en_search_prefixes(text)
    assert match.group() == punct


def test_tokenizer_splits_bracket_period(en_tokenizer):
    text = "(And a 6a.m. run through Washington Park)."
    tokens = en_tokenizer(text)
    assert tokens[len(tokens) - 1].text == "."
spacy/tests/en/tokenizer/test_text.py (new file, 36 lines)
@@ -0,0 +1,36 @@
# coding: utf-8
"""Test that longer and mixed texts are tokenized correctly."""


from __future__ import unicode_literals

import pytest


def test_tokenizer_handles_long_text(en_tokenizer):
    text = """Tributes pour in for late British Labour Party leader

Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.

In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

"Mr. Smith, throughout his distinguished"""
    tokens = en_tokenizer(text)
    assert len(tokens) == 76


@pytest.mark.parametrize('text,length', [
    ("The U.S. Army likes Shock and Awe.", 8),
    ("U.N. regulations are not a part of their concern.", 10),
    ("“Isn't it?”", 6),
    ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
    ("""'Me too!', Mr. P. Delaware cried. """, 11),
    ("They ran about 10km.", 6),
    # ("But then the 6,000-year ice age came...", 10)
    ])
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
    tokens = en_tokenizer(text)
    assert len(tokens) == length
spacy/tests/hu/conftest.py (new file, 11 lines)
@@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest

from ...hu import Hungarian


@pytest.fixture
def hu_tokenizer():
    return Hungarian.Defaults.create_tokenizer()
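A minimal usage sketch of the factory this fixture wraps (assuming the tree from this commit is installed as `spacy`, so `spacy.hu.Hungarian` is importable); the expected tokens are taken from the Hungarian test cases below:

# Hypothetical stand-alone use of the same factory the hu_tokenizer fixture returns
from spacy.hu import Hungarian

hu_tokenizer = Hungarian.Defaults.create_tokenizer()
tokens = hu_tokenizer("A pl. rovidites.")
assert [t.text for t in tokens] == ['A', 'pl.', 'rovidites', '.']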
@@ -2,25 +2,27 @@
from __future__ import unicode_literals

import pytest


DEFAULT_TESTS = [
    ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
    ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
    ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
    ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
    ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
    ('A .hu.', ['A', '.hu', '.']),
    ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
    ('A pl.', ['A', 'pl.']),
    ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
    ('Egy..ket.', ['Egy', '..', 'ket', '.']),
    ('Valami... van.', ['Valami', '...', 'van', '.']),
    ('Valami ...van...', ['Valami', '...', 'van', '...']),
    ('Valami...', ['Valami', '...']),
    ('Valami ...', ['Valami', '...']),
    ('Valami ... más.', ['Valami', '...', 'más', '.'])
]

HYPHEN_TESTS = [
    ('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']),
    ('Egy -nak.', ['Egy', '-nak', '.']),
    ('Egy bel-.', ['Egy', 'bel-', '.']),
@@ -39,195 +41,194 @@
    ('A 7-es.', ['A', '7-es', '.']),
    ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
    ('A %-sal.', ['A', '%-sal', '.']),
    ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])
]

NUMBER_TESTS = [
    ('A 2b van.', ['A', '2b', 'van', '.']),
    ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']),
    ('A 2b.', ['A', '2b', '.']),
    ('A 2b-ben.', ['A', '2b-ben', '.']),
    ('A 3.b van.', ['A', '3.b', 'van', '.']),
    ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']),
    ('A 3.b.', ['A', '3.b', '.']),
    ('A 3.b-ben.', ['A', '3.b-ben', '.']),
    ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']),
    ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']),
    ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']),
    ('A 1:35 van.', ['A', '1:35', 'van', '.']),
    ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']),
    ('A 1:35-ben.', ['A', '1:35-ben', '.']),
    ('A 1.35 van.', ['A', '1.35', 'van', '.']),
    ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']),
    ('A 1.35-ben.', ['A', '1.35-ben', '.']),
    ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']),
    ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']),
    ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']),
    ('A 10--12 van.', ['A', '10--12', 'van', '.']),
    ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']),
    ('A 10--12-ben.', ['A', '10--12-ben', '.']),
    ('A 10‐12 van.', ['A', '10‐12', 'van', '.']),
    ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']),
    ('A 10‐12-ben.', ['A', '10‐12-ben', '.']),
    ('A 10‑12 van.', ['A', '10‑12', 'van', '.']),
    ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']),
    ('A 10‑12-ben.', ['A', '10‑12-ben', '.']),
    ('A 10‒12 van.', ['A', '10‒12', 'van', '.']),
    ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']),
    ('A 10‒12-ben.', ['A', '10‒12-ben', '.']),
    ('A 10–12 van.', ['A', '10–12', 'van', '.']),
    ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']),
    ('A 10–12-ben.', ['A', '10–12-ben', '.']),
    ('A 10—12 van.', ['A', '10—12', 'van', '.']),
    ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']),
    ('A 10—12-ben.', ['A', '10—12-ben', '.']),
    ('A 10―12 van.', ['A', '10―12', 'van', '.']),
    ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']),
    ('A 10―12-ben.', ['A', '10―12-ben', '.']),
    ('A -23,12 van.', ['A', '-23,12', 'van', '.']),
    ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']),
    ('A -23,12-ben.', ['A', '-23,12-ben', '.']),
    ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']),
    ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']),
    ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']),
    ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']),
    ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']),
    ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']),
    ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']),
    ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']),
    ('A C++ van.', ['A', 'C++', 'van', '.']),
    ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']),
    ('A C++.', ['A', 'C++', '.']),
    ('A C++-ben.', ['A', 'C++-ben', '.']),
    ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']),
    ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']),
    ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']),
    ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']),
    ('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']),
    ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']),
    ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']),
    ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']),
    ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']),
    ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']),
    ('A IV. 12.', ['A', 'IV.', '12.']),
    ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']),
    ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']),
    ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']),
    ('A 2003.01.06.', ['A', '2003.01.06.']),
    ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']),
    ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']),
    ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']),
    ('A IV.12.', ['A', 'IV.12.']),
    ('A IV.12-ben.', ['A', 'IV.12-ben', '.']),
    ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']),
    ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']),
    ('A 1.1.2.', ['A', '1.1.2.']),
    ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']),
    ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']),
    ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']),
    ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']),
    ('A 3,14 van.', ['A', '3,14', 'van', '.']),
    ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']),
    ('A 3,14-ben.', ['A', '3,14-ben', '.']),
    ('A 3.14 van.', ['A', '3.14', 'van', '.']),
    ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']),
    ('A 3.14-ben.', ['A', '3.14-ben', '.']),
    ('A 15. van.', ['A', '15.', 'van', '.']),
    ('A 15-ben van.', ['A', '15-ben', 'van', '.']),
    ('A 15-ben.', ['A', '15-ben', '.']),
    ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']),
    ('A 15.-ben.', ['A', '15.-ben', '.']),
    ('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']),
    ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']),
    ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']),
    ('A -0,99% van.', ['A', '-0,99%', 'van', '.']),
    ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']),
    ('A -0,99%.', ['A', '-0,99%', '.']),
    ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']),
    ('A 10--20% van.', ['A', '10--20%', 'van', '.']),
    ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']),
    ('A 10--20%.', ['A', '10--20%', '.']),
    ('A 10--20%-ben.', ['A', '10--20%-ben', '.']),
    ('A 99§ van.', ['A', '99§', 'van', '.']),
    ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']),
    ('A 99§-ben.', ['A', '99§-ben', '.']),
    ('A 10--20§ van.', ['A', '10--20§', 'van', '.']),
    ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']),
    ('A 10--20§-ben.', ['A', '10--20§-ben', '.']),
    ('A 99° van.', ['A', '99°', 'van', '.']),
    ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']),
    ('A 99°-ben.', ['A', '99°-ben', '.']),
    ('A 10--20° van.', ['A', '10--20°', 'van', '.']),
    ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']),
    ('A 10--20°-ben.', ['A', '10--20°-ben', '.']),
    ('A °C van.', ['A', '°C', 'van', '.']),
    ('A °C-ben van.', ['A', '°C-ben', 'van', '.']),
    ('A °C.', ['A', '°C', '.']),
    ('A °C-ben.', ['A', '°C-ben', '.']),
    ('A 100°C van.', ['A', '100°C', 'van', '.']),
    ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']),
    ('A 100°C.', ['A', '100°C', '.']),
    ('A 100°C-ben.', ['A', '100°C-ben', '.']),
    ('A 800x600 van.', ['A', '800x600', 'van', '.']),
    ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']),
    ('A 800x600-ben.', ['A', '800x600-ben', '.']),
    ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']),
    ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']),
    ('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']),
    ('A 5/J van.', ['A', '5/J', 'van', '.']),
    ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']),
    ('A 5/J-ben.', ['A', '5/J-ben', '.']),
    ('A 5/J. van.', ['A', '5/J.', 'van', '.']),
    ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']),
    ('A 5/J.-ben.', ['A', '5/J.-ben', '.']),
    ('A III/1 van.', ['A', 'III/1', 'van', '.']),
    ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']),
    ('A III/1-ben.', ['A', 'III/1-ben', '.']),
    ('A III/1. van.', ['A', 'III/1.', 'van', '.']),
    ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']),
    ('A III/1.-ben.', ['A', 'III/1.-ben', '.']),
    ('A III/c van.', ['A', 'III/c', 'van', '.']),
    ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']),
    ('A III/c.', ['A', 'III/c', '.']),
    ('A III/c-ben.', ['A', 'III/c-ben', '.']),
    ('A TU–154 van.', ['A', 'TU–154', 'van', '.']),
    ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']),
    ('A TU–154-ben.', ['A', 'TU–154-ben', '.'])
]

QUOTE_TESTS = [
    ('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
    ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
    ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']),
    ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']),
    ("A don't van.", ['A', "don't", 'van', '.'])
]

DOT_TESTS = [
    ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
    ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
    ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
    ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
    ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
    ('A .hu.', ['A', '.hu', '.']),
    ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
    ('A pl.', ['A', 'pl.']),
    ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
    ('Egy..ket.', ['Egy', '..', 'ket', '.']),
    ('Valami... van.', ['Valami', '...', 'van', '.']),
    ('Valami ...van...', ['Valami', '...', 'van', '...']),
    ('Valami...', ['Valami', '...']),
    ('Valami ...', ['Valami', '...']),
    ('Valami ... más.', ['Valami', '...', 'más', '.'])
]


TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS  # + NUMBER_TESTS + HYPHEN_TESTS


@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
    tokens = hu_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
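Because several of the Hungarian test sentences embed a newline (e.g. 'N. kormányzósági\nszékhely.'), the comparison above filters out whitespace tokens via `token.is_space`. A sketch of how a single TESTCASES entry expands under the parametrized test (hypothetical, for illustration only):

# Roughly what one collected test item does:
tokens = hu_tokenizer('N. kormányzósági\nszékhely.')
token_list = [token.text for token in tokens if not token.is_space]
assert token_list == ['N.', 'kormányzósági', 'székhely', '.']  # the '\n' token is dropped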
spacy/tests/regression/test_issue351.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from __future__ import unicode_literals
from ...en import English

import pytest


@pytest.fixture
def en_tokenizer():
    return English.Defaults.create_tokenizer()


def test_issue351(en_tokenizer):
    doc = en_tokenizer("   This is a cat.")
    assert doc[0].idx == 0
    assert len(doc[0]) == 3
    assert doc[1].idx == 3
spacy/tests/regression/test_issue360.py (new file, 14 lines)
@@ -0,0 +1,14 @@
from __future__ import unicode_literals
from ...en import English

import pytest


@pytest.fixture
def en_tokenizer():
    return English.Defaults.create_tokenizer()


def test_big_ellipsis(en_tokenizer):
    tokens = en_tokenizer(u'$45...............Asking')
    assert len(tokens) > 2
@@ -1,4 +0,0 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]
@@ -1,7 +1,23 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest

from ...en import English
from ...de import German
from ...es import Spanish
from ...it import Italian
from ...fr import French
from ...pt import Portuguese
from ...nl import Dutch
from ...sv import Swedish
from ...hu import Hungarian


LANGUAGES = [English, German, Spanish, Italian, French, Dutch, Swedish, Hungarian]


@pytest.fixture(params=LANGUAGES)
def tokenizer(request):
    lang = request.param
    return lang.Defaults.create_tokenizer()
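Since the `tokenizer` fixture is parametrized over `LANGUAGES`, any test that requests it is collected once per language class. A hypothetical example test (not part of this commit), assuming the conftest above is picked up by pytest:

# Runs once each for English, German, Spanish, Italian, French, Dutch, Swedish and Hungarian
def test_tokenizer_handles_simple_sentence(tokenizer):
    tokens = tokenizer("This is a sentence.")
    assert len(tokens) >= 2  # at minimum, the words are split on whitespace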
@@ -1,58 +0,0 @@
from __future__ import unicode_literals
import pytest


def test_possess(en_tokenizer):
    tokens = en_tokenizer("Mike's")
    assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
    assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
    assert len(tokens) == 2


def test_apostrophe(en_tokenizer):
    tokens = en_tokenizer("schools'")
    assert len(tokens) == 2
    assert tokens[1].orth_ == "'"
    assert tokens[0].orth_ == "schools"


def test_LL(en_tokenizer):
    tokens = en_tokenizer("we'll")
    assert len(tokens) == 2
    assert tokens[1].orth_ == "'ll"
    assert tokens[1].lemma_ == "will"
    assert tokens[0].orth_ == "we"


def test_aint(en_tokenizer):
    tokens = en_tokenizer("ain't")
    assert len(tokens) == 2
    assert tokens[0].orth_ == "ai"
    assert tokens[0].lemma_ == "be"
    assert tokens[1].orth_ == "n't"
    assert tokens[1].lemma_ == "not"


def test_capitalized(en_tokenizer):
    tokens = en_tokenizer("can't")
    assert len(tokens) == 2
    tokens = en_tokenizer("Can't")
    assert len(tokens) == 2
    tokens = en_tokenizer("Ain't")
    assert len(tokens) == 2
    assert tokens[0].orth_ == "Ai"
    assert tokens[0].lemma_ == "be"


def test_punct(en_tokenizer):
    tokens = en_tokenizer("We've")
    assert len(tokens) == 2
    tokens = en_tokenizer("``We've")
    assert len(tokens) == 3


@pytest.mark.xfail
def test_therell(en_tokenizer):
    tokens = en_tokenizer("there'll")
    assert len(tokens) == 2
    assert tokens[0].text == "there"
    assert tokens[1].text == "there"
@@ -1,35 +0,0 @@
from __future__ import unicode_literals
import pytest


def test_tweebo_challenge(en_tokenizer):
    text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
    tokens = en_tokenizer(text)
    assert tokens[0].orth_ == ":o"
    assert tokens[1].orth_ == ":/"
    assert tokens[2].orth_ == ":'("
    assert tokens[3].orth_ == ">:o"
    assert tokens[4].orth_ == "(:"
    assert tokens[5].orth_ == ":)"
    assert tokens[6].orth_ == ">.<"
    assert tokens[7].orth_ == "XD"
    assert tokens[8].orth_ == "-__-"
    assert tokens[9].orth_ == "o.O"
    assert tokens[10].orth_ == ";D"
    assert tokens[11].orth_ == ":-)"
    assert tokens[12].orth_ == "@_@"
    assert tokens[13].orth_ == ":P"
    assert tokens[14].orth_ == "8D"
    assert tokens[15].orth_ == ":1"
    assert tokens[16].orth_ == ">:("
    assert tokens[17].orth_ == ":D"
    assert tokens[18].orth_ == "=|"
    assert tokens[19].orth_ == '")'
    assert tokens[20].orth_ == ':>'
    assert tokens[21].orth_ == '....'


def test_false_positive(en_tokenizer):
    text = "example:)"
    tokens = en_tokenizer(text)
    assert len(tokens) == 3
spacy/tests/tokenizer/test_exceptions.py (new file, 41 lines)
@@ -0,0 +1,41 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handled correctly."""


from __future__ import unicode_literals

import pytest


def test_tokenizer_handles_emoticons(tokenizer):
    # Tweebo challenge (CMU)
    text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
    tokens = tokenizer(text)
    assert tokens[0].text == ":o"
    assert tokens[1].text == ":/"
    assert tokens[2].text == ":'("
    assert tokens[3].text == ">:o"
    assert tokens[4].text == "(:"
    assert tokens[5].text == ":)"
    assert tokens[6].text == ">.<"
    assert tokens[7].text == "XD"
    assert tokens[8].text == "-__-"
    assert tokens[9].text == "o.O"
    assert tokens[10].text == ";D"
    assert tokens[11].text == ":-)"
    assert tokens[12].text == "@_@"
    assert tokens[13].text == ":P"
    assert tokens[14].text == "8D"
    assert tokens[15].text == ":1"
    assert tokens[16].text == ">:("
    assert tokens[17].text == ":D"
    assert tokens[18].text == "=|"
    assert tokens[19].text == '")'
    assert tokens[20].text == ':>'
    assert tokens[21].text == '....'


@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
    tokens = tokenizer(text)
    assert len(tokens) == length
@ -1,62 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
def test_hyphen(en_tokenizer):
|
|
||||||
tokens = en_tokenizer('best-known')
|
|
||||||
assert len(tokens) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_numeric_range(en_tokenizer):
|
|
||||||
tokens = en_tokenizer('0.1-13.5')
|
|
||||||
assert len(tokens) == 3
|
|
||||||
|
|
||||||
def test_period(en_tokenizer):
|
|
||||||
tokens = en_tokenizer('best.Known')
|
|
||||||
assert len(tokens) == 3
|
|
||||||
tokens = en_tokenizer('zombo.com')
|
|
||||||
assert len(tokens) == 1
|
|
||||||
|
|
||||||
|
|
||||||
def test_ellipsis(en_tokenizer):
|
|
||||||
tokens = en_tokenizer('best...Known')
|
|
||||||
assert len(tokens) == 3
|
|
||||||
tokens = en_tokenizer('best...known')
|
|
||||||
assert len(tokens) == 3
|
|
||||||
|
|
||||||
def test_big_ellipsis(en_tokenizer):
|
|
||||||
'''Test regression identified in Issue #360'''
|
|
||||||
tokens = en_tokenizer(u'$45...............Asking')
|
|
||||||
assert len(tokens) > 2
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_email(en_tokenizer):
|
|
||||||
tokens = en_tokenizer('hello@example.com')
|
|
||||||
assert len(tokens) == 1
|
|
||||||
tokens = en_tokenizer('hi+there@gmail.it')
|
|
||||||
assert len(tokens) == 1
|
|
||||||
|
|
||||||
|
|
||||||
def test_double_hyphen(en_tokenizer):
|
|
||||||
tokens = en_tokenizer(u'No decent--let alone well-bred--people.')
|
|
||||||
assert tokens[0].text == u'No'
|
|
||||||
assert tokens[1].text == u'decent'
|
|
||||||
assert tokens[2].text == u'--'
|
|
||||||
assert tokens[3].text == u'let'
|
|
||||||
assert tokens[4].text == u'alone'
|
|
||||||
assert tokens[5].text == u'well'
|
|
||||||
assert tokens[6].text == u'-'
|
|
||||||
# TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter
|
|
||||||
# on infixes.
|
|
||||||
assert tokens[7].text == u'bred'
|
|
||||||
assert tokens[8].text == u'--'
|
|
||||||
assert tokens[9].text == u'people'
|
|
||||||
|
|
||||||
|
|
||||||
def test_infix_comma(en_tokenizer):
|
|
||||||
# Re issue #326
|
|
||||||
tokens = en_tokenizer(u'Hello,world')
|
|
||||||
assert tokens[0].text == u'Hello'
|
|
||||||
assert tokens[1].text == u','
|
|
||||||
assert tokens[2].text == u'world'
|
|
|
@ -1,9 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
def test_only_pre1(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("(")) == 1
|
|
||||||
|
|
||||||
|
|
||||||
def test_only_pre2(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("((")) == 2
|
|
|
@ -1,43 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def close_puncts():
|
|
||||||
return [')', ']', '}', '*']
|
|
||||||
|
|
||||||
|
|
||||||
def test_close(close_puncts, en_tokenizer):
|
|
||||||
word_str = 'Hello'
|
|
||||||
for p in close_puncts:
|
|
||||||
string = word_str + p
|
|
||||||
tokens = en_tokenizer(string)
|
|
||||||
assert len(tokens) == 2
|
|
||||||
assert tokens[1].string == p
|
|
||||||
assert tokens[0].string == word_str
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_different_close(close_puncts, en_tokenizer):
|
|
||||||
word_str = 'Hello'
|
|
||||||
for p in close_puncts:
|
|
||||||
string = word_str + p + "'"
|
|
||||||
tokens = en_tokenizer(string)
|
|
||||||
assert len(tokens) == 3
|
|
||||||
assert tokens[0].string == word_str
|
|
||||||
assert tokens[1].string == p
|
|
||||||
assert tokens[2].string == "'"
|
|
||||||
|
|
||||||
|
|
||||||
def test_three_same_close(close_puncts, en_tokenizer):
|
|
||||||
word_str = 'Hello'
|
|
||||||
for p in close_puncts:
|
|
||||||
string = word_str + p + p + p
|
|
||||||
tokens = en_tokenizer(string)
|
|
||||||
assert len(tokens) == 4
|
|
||||||
assert tokens[0].string == word_str
|
|
||||||
assert tokens[1].string == p
|
|
||||||
|
|
||||||
|
|
||||||
def test_double_end_quote(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("Hello''")) == 2
|
|
||||||
assert len(en_tokenizer("''")) == 1
|
|
|
@ -1,46 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def open_puncts():
|
|
||||||
return ['(', '[', '{', '*']
|
|
||||||
|
|
||||||
|
|
||||||
def test_open(open_puncts, en_tokenizer):
|
|
||||||
word_str = 'Hello'
|
|
||||||
for p in open_puncts:
|
|
||||||
string = p + word_str
|
|
||||||
tokens = en_tokenizer(string)
|
|
||||||
assert len(tokens) == 2
|
|
||||||
assert tokens[0].orth_ == p
|
|
||||||
assert tokens[1].orth_ == word_str
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_different_open(open_puncts, en_tokenizer):
|
|
||||||
word_str = 'Hello'
|
|
||||||
for p in open_puncts:
|
|
||||||
string = p + "`" + word_str
|
|
||||||
tokens = en_tokenizer(string)
|
|
||||||
assert len(tokens) == 3
|
|
||||||
assert tokens[0].orth_ == p
|
|
||||||
assert tokens[1].orth_ == "`"
|
|
||||||
assert tokens[2].orth_ == word_str
|
|
||||||
|
|
||||||
|
|
||||||
def test_three_same_open(open_puncts, en_tokenizer):
|
|
||||||
word_str = 'Hello'
|
|
||||||
for p in open_puncts:
|
|
||||||
string = p + p + p + word_str
|
|
||||||
tokens = en_tokenizer(string)
|
|
||||||
assert len(tokens) == 4
|
|
||||||
assert tokens[0].orth_ == p
|
|
||||||
assert tokens[3].orth_ == word_str
|
|
||||||
|
|
||||||
|
|
||||||
def test_open_appostrophe(en_tokenizer):
|
|
||||||
string = "'The"
|
|
||||||
tokens = en_tokenizer(string)
|
|
||||||
assert len(tokens) == 2
|
|
||||||
assert tokens[0].orth_ == "'"
|
|
|
@ -1,46 +0,0 @@
|
||||||
"""Test entries in the tokenization special-case interacting with prefix
|
|
||||||
and suffix punctuation."""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
def test_no_special(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("(can)")) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_no_punct(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("can't")) == 2
|
|
||||||
|
|
||||||
|
|
||||||
def test_prefix(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("(can't")) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_suffix(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("can't)")) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_wrap(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("(can't)")) == 4
|
|
||||||
|
|
||||||
|
|
||||||
def test_uneven_wrap(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("(can't?)")) == 5
|
|
||||||
|
|
||||||
|
|
||||||
def test_prefix_interact(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("U.S.")) == 1
|
|
||||||
assert len(en_tokenizer("us.")) == 2
|
|
||||||
assert len(en_tokenizer("(U.S.")) == 2
|
|
||||||
|
|
||||||
|
|
||||||
def test_suffix_interact(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("U.S.)")) == 2
|
|
||||||
|
|
||||||
|
|
||||||
def test_even_wrap_interact(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("(U.S.)")) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_uneven_wrap_interact(en_tokenizer):
|
|
||||||
assert len(en_tokenizer("(U.S.?)")) == 4
|
|
|
@ -1,9 +0,0 @@
|
||||||
"""Test suspected freeing of strings"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
def test_one(en_tokenizer):
|
|
||||||
tokens = en_tokenizer('Betty Botter bought a pound of butter.')
|
|
||||||
assert tokens[0].orth_ == 'Betty'
|
|
||||||
tokens2 = en_tokenizer('Betty also bought a pound of butter.')
|
|
||||||
assert tokens2[0].orth_ == 'Betty'
|
|
|
@ -1,32 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def paired_puncts():
|
|
||||||
return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
|
|
||||||
|
|
||||||
|
|
||||||
def test_token(paired_puncts, en_tokenizer):
|
|
||||||
word_str = 'Hello'
|
|
||||||
for open_, close_ in paired_puncts:
|
|
||||||
string = open_ + word_str + close_
|
|
||||||
tokens = en_tokenizer(string)
|
|
||||||
assert len(tokens) == 3
|
|
||||||
assert tokens[0].orth_ == open_
|
|
||||||
assert tokens[1].orth_ == word_str
|
|
||||||
assert tokens[2].orth_ == close_
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_different(paired_puncts, en_tokenizer):
|
|
||||||
word_str = 'Hello'
|
|
||||||
for open_, close_ in paired_puncts:
|
|
||||||
string = "`" + open_ + word_str + close_ + "'"
|
|
||||||
tokens = en_tokenizer(string)
|
|
||||||
assert len(tokens) == 5
|
|
||||||
assert tokens[0].orth_ == "`"
|
|
||||||
assert tokens[1].orth_ == open_
|
|
||||||
assert tokens[2].orth_ == word_str
|
|
||||||
assert tokens[2].orth_ == word_str
|
|
||||||
assert tokens[3].orth_ == close_
|
|
||||||
assert tokens[4].orth_ == "'"
|
|
|
@ -1,172 +1,83 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
from os import path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import io
|
|
||||||
import pickle
|
|
||||||
import cloudpickle
|
|
||||||
import tempfile
|
|
||||||
|
|
||||||
from ... import util
|
from ...util import utf8open
|
||||||
from ...language_data import TOKENIZER_PREFIXES
|
|
||||||
|
|
||||||
en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
|
|
||||||
|
|
||||||
# @pytest.mark.xfail
|
def test_tokenizer_handles_no_word(tokenizer):
|
||||||
# def test_pickle(en_tokenizer):
|
tokens = tokenizer("")
|
||||||
# file_ = io.BytesIO()
|
|
||||||
# cloudpickle.dump(en_tokenizer, file_)
|
|
||||||
# file_.seek(0)
|
|
||||||
# loaded = pickle.load(file_)
|
|
||||||
# assert loaded is not None
|
|
||||||
|
|
||||||
def test_pre_punct_regex():
|
|
||||||
string = "(can't"
|
|
||||||
match = en_search_prefixes(string)
|
|
||||||
assert match.group() == "("
|
|
||||||
|
|
||||||
def test_no_word(en_tokenizer):
|
|
||||||
tokens = en_tokenizer(u'')
|
|
||||||
assert len(tokens) == 0
|
assert len(tokens) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_single_word(en_tokenizer):
|
@pytest.mark.parametrize('text', ["lorem"])
|
||||||
tokens = en_tokenizer(u'hello')
|
def test_tokenizer_handles_single_word(tokenizer, text):
|
||||||
assert tokens[0].orth_ == 'hello'
|
tokens = tokenizer(text)
|
||||||
|
assert tokens[0].text == text
|
||||||
|
|
||||||
|
|
||||||
def test_two_words(en_tokenizer):
|
def test_tokenizer_handles_punct(tokenizer):
|
||||||
tokens = en_tokenizer('hello possums')
|
text = "Lorem, ipsum."
|
||||||
assert len(tokens) == 2
|
tokens = tokenizer(text)
|
||||||
assert tokens[0].orth_ != tokens[1].orth_
|
|
||||||
|
|
||||||
|
|
||||||
def test_punct(en_tokenizer):
|
|
||||||
tokens = en_tokenizer('hello, possums.')
|
|
||||||
assert len(tokens) == 4
|
assert len(tokens) == 4
|
||||||
assert tokens[0].orth_ == 'hello'
|
assert tokens[0].text == "Lorem"
|
||||||
assert tokens[1].orth_ == ','
|
assert tokens[1].text == ","
|
||||||
assert tokens[2].orth_ == 'possums'
|
assert tokens[2].text == "ipsum"
|
||||||
assert tokens[1].orth_ != 'hello'
|
assert tokens[1].text != "Lorem"
|
||||||
|
|
||||||
|
|
||||||
def test_digits(en_tokenizer):
|
def test_tokenizer_handles_digits(tokenizer):
|
||||||
tokens = en_tokenizer('The year: 1984.')
|
exceptions = ["hu"]
|
||||||
assert len(tokens) == 5
|
text = "Lorem ipsum: 1984."
|
||||||
assert tokens[0].orth == en_tokenizer.vocab['The'].orth
|
tokens = tokenizer(text)
|
||||||
assert tokens[3].orth == en_tokenizer.vocab['1984'].orth
|
|
||||||
|
if tokens[0].lang_ not in exceptions:
|
||||||
|
assert len(tokens) == 5
|
||||||
|
assert tokens[0].text == "Lorem"
|
||||||
|
assert tokens[3].text == "1984"
|
||||||
|
|
||||||
|
|
||||||
def test_contraction(en_tokenizer):
|
@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
|
||||||
tokens = en_tokenizer("don't giggle")
|
def test_tokenizer_keep_urls(tokenizer, text):
|
||||||
assert len(tokens) == 3
|
tokens = tokenizer(text)
|
||||||
assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
|
assert len(tokens) == 1
|
||||||
tokens = en_tokenizer("i said don't!")
|
|
||||||
assert len(tokens) == 5
|
|
||||||
assert tokens[4].orth == en_tokenizer.vocab['!'].orth
|
|
||||||
|
|
||||||
def test_contraction_punct(en_tokenizer):
|
|
||||||
tokens = [w.text for w in en_tokenizer("(can't")]
|
|
||||||
assert tokens == ['(', 'ca', "n't"]
|
|
||||||
tokens = en_tokenizer("`ain't")
|
|
||||||
assert len(tokens) == 3
|
|
||||||
tokens = en_tokenizer('''"isn't''')
|
|
||||||
assert len(tokens) == 3
|
|
||||||
tokens = en_tokenizer("can't!")
|
|
||||||
assert len(tokens) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_sample(en_tokenizer):
|
@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
|
||||||
text = """Tributes pour in for late British Labour Party leader
|
def test_tokenizer_keeps_email(tokenizer, text):
|
||||||
|
tokens = tokenizer(text)
|
||||||
|
assert len(tokens) == 1
|
||||||
|
|
||||||
Tributes poured in from around the world Thursday
|
|
||||||
to the late Labour Party leader John Smith, who died earlier from a massive
|
|
||||||
heart attack aged 55.
|
|
||||||
|
|
||||||
In Washington, the US State Department issued a statement regretting "the
|
def test_tokenizer_handles_long_text(tokenizer):
|
||||||
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
|
text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit
|
||||||
|
|
||||||
"Mr. Smith, throughout his distinguished"""
|
Cras egestas orci non porttitor maximus.
|
||||||
|
Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.
|
||||||
|
|
||||||
tokens = en_tokenizer(text)
|
Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.
|
||||||
|
|
||||||
|
"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo."""
|
||||||
|
|
||||||
|
tokens = tokenizer(text)
|
||||||
assert len(tokens) > 5
|
assert len(tokens) > 5
|
||||||
|
|
||||||
|
|
||||||
def test_cnts1(en_tokenizer):
|
@pytest.mark.parametrize('file_name', ["sun.txt"])
|
||||||
text = u"""The U.S. Army likes Shock and Awe."""
|
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
|
||||||
tokens = en_tokenizer(text)
|
loc = path.join(path.dirname(__file__), '..', file_name)
|
||||||
assert len(tokens) == 8
|
text = utf8open(loc).read()
|
||||||
|
assert len(text) != 0
|
||||||
|
tokens = tokenizer(text)
|
||||||
|
assert len(tokens) > 100
|
||||||
|
|
||||||
|
|
||||||
def test_cnts2(en_tokenizer):
|
def test_tokenizer_suspected_freeing_strings(tokenizer):
|
||||||
text = u"""U.N. regulations are not a part of their concern."""
|
text1 = "Lorem dolor sit amet, consectetur adipiscing elit."
|
||||||
tokens = en_tokenizer(text)
|
text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
|
||||||
assert len(tokens) == 10
|
tokens1 = tokenizer(text1)
|
||||||
|
tokens2 = tokenizer(text2)
|
||||||
|
assert tokens1[0].text == "Lorem"
|
||||||
def test_cnts3(en_tokenizer):
|
assert tokens2[0].text == "Lorem"
|
||||||
text = u"“Isn't it?”"
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
words = [t.orth_ for t in tokens]
|
|
||||||
assert len(words) == 6
|
|
||||||
|
|
||||||
|
|
||||||
def test_cnts4(en_tokenizer):
|
|
||||||
text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
words = [t.orth_ for t in tokens]
|
|
||||||
assert len(words) == 15
|
|
||||||
|
|
||||||
|
|
||||||
def test_cnts5(en_tokenizer):
|
|
||||||
text = """'Me too!', Mr. P. Delaware cried. """
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
assert len(tokens) == 11
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_mr(en_tokenizer):
|
|
||||||
text = """Today is Tuesday.Mr."""
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
assert len(tokens) == 5
|
|
||||||
assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
|
|
||||||
|
|
||||||
|
|
||||||
def test_cnts6(en_tokenizer):
|
|
||||||
text = u'They ran about 10km.'
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
words = [t.orth_ for t in tokens]
|
|
||||||
assert len(words) == 6
|
|
||||||
|
|
||||||
def test_bracket_period(en_tokenizer):
|
|
||||||
text = u'(And a 6a.m. run through Washington Park).'
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
assert tokens[len(tokens) - 1].orth_ == u'.'
|
|
||||||
|
|
||||||
|
|
||||||
def test_ie(en_tokenizer):
|
|
||||||
text = u"It's mediocre i.e. bad."
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
assert len(tokens) == 6
|
|
||||||
assert tokens[3].orth_ == "i.e."
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_whitespace(en_tokenizer):
|
|
||||||
orig_str = u'there are 2 spaces after this '
|
|
||||||
tokens = en_tokenizer(orig_str)
|
|
||||||
assert repr(tokens.text_with_ws) == repr(orig_str)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_em_dash_infix(en_tokenizer):
|
|
||||||
# Re Issue #225
|
|
||||||
tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, '''
|
|
||||||
'''you'll have to walk there.\u2014Ariel.''')
|
|
||||||
assert tokens[6].text == 'Puddleton'
|
|
||||||
assert tokens[7].text == '?'
|
|
||||||
assert tokens[8].text == '\u2014'
|
|
||||||
|
|
||||||
#def test_cnts7():
|
|
||||||
# text = 'But then the 6,000-year ice age came...'
|
|
||||||
# tokens = EN.tokenize(text)
|
|
||||||
# assert len(tokens) == 10
|
|
||||||
|
|
|
@ -1,67 +1,51 @@
+# coding: utf-8
 """Test that tokens are created correctly for whitespace."""


 from __future__ import unicode_literals

 import pytest


-def test_single_space(en_tokenizer):
-    tokens = en_tokenizer('hello possums')
-    assert len(tokens) == 2
+@pytest.mark.parametrize('text', ["lorem ipsum"])
+def test_tokenizer_splits_single_space(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 2


-def test_double_space(en_tokenizer):
-    tokens = en_tokenizer('hello  possums')
-    assert len(tokens) == 3
-    assert tokens[1].orth_ == ' '
+@pytest.mark.parametrize('text', ["lorem  ipsum"])
+def test_tokenizer_splits_double_space(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[1].text == " "


-def test_newline(en_tokenizer):
-    tokens = en_tokenizer('hello\npossums')
-    assert len(tokens) == 3
+@pytest.mark.parametrize('text', ["lorem ipsum  "])
+def test_tokenizer_handles_double_trainling_ws(tokenizer, text):
+    tokens = tokenizer(text)
+    assert repr(tokens.text_with_ws) == repr(text)


-def test_newline_space(en_tokenizer):
-    tokens = en_tokenizer('hello \npossums')
-    assert len(tokens) == 3
+@pytest.mark.parametrize('text', ["lorem\nipsum"])
+def test_tokenizer_splits_newline(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[1].text == "\n"


-def test_newline_double_space(en_tokenizer):
-    tokens = en_tokenizer('hello  \npossums')
-    assert len(tokens) == 3
+@pytest.mark.parametrize('text', ["lorem \nipsum"])
+def test_tokenizer_splits_newline_space(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3


-def test_newline_space_wrap(en_tokenizer):
-    tokens = en_tokenizer('hello \n possums')
-    assert len(tokens) == 3
+@pytest.mark.parametrize('text', ["lorem  \nipsum"])
+def test_tokenizer_splits_newline_double_space(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3


+@pytest.mark.parametrize('text', ["lorem \n ipsum"])
+def test_tokenizer_splits_newline_space_wrap(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3


-def test_leading_space_offsets(en_tokenizer):
-    '''Issue #351
-    # this works
-
-    text1 = u"This is a cat."
-    a = english_spacy(text1)
-
-    tok0 = list(a.sents)[0][0]
-    print tok0, tok0.idx, text1[tok0.idx]
-
-    tok1 = list(a.sents)[0][1]
-    print tok1, tok1.idx, text1[tok1.idx]
-
-    print "=="
-
-    # this does not work
-
-    text2 = u" This is a cat."
-    b = english_spacy(text2)
-
-    tok0 = list(b.sents)[0][0]
-    print tok0, tok0.idx, text2[tok0.idx]
-
-    tok1 = list(b.sents)[0][1]
-    print tok1, tok1.idx, text2[tok1.idx]
-    '''
-    doc = en_tokenizer(u" This is a cat.")
-    assert doc[0].idx == 0
-    assert len(doc[0]) == 3
-    assert doc[1].idx == 3
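The hunk above replaces hard-coded `en_tokenizer` calls with `@pytest.mark.parametrize` over a shared `tokenizer` fixture. A minimal, self-contained sketch of that pattern, using a hypothetical whitespace tokenizer stand-in (not spaCy's fixture) purely so the example runs on its own:

import pytest


class WhitespaceTokenizer(object):
    # Stand-in for the real tokenizer fixture: splits on whitespace only.
    def __call__(self, text):
        return text.split()


@pytest.fixture
def tokenizer():
    return WhitespaceTokenizer()


@pytest.mark.parametrize('text', ["lorem ipsum"])
def test_splits_single_space(tokenizer, text):
    # One parametrized test covers any number of input strings.
    assert len(tokenizer(text)) == 2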
@ -1,21 +0,0 @@
-from __future__ import unicode_literals
-
-from spacy.util import utf8open
-
-import pytest
-from os import path
-
-
-HERE = path.dirname(__file__)
-
-
-@pytest.fixture
-def sun_txt():
-    loc = path.join(HERE, '..', 'sun.txt')
-    return utf8open(loc).read()
-
-
-def test_tokenize(sun_txt, en_tokenizer):
-    assert len(sun_txt) != 0
-    tokens = en_tokenizer(sun_txt)
-    assert len(tokens) > 100
@ -1,20 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-import os
-
-
-@pytest.fixture(scope='session')
-def nlp():
-    from spacy.en import English
-    if os.environ.get('SPACY_DATA'):
-        data_dir = os.environ.get('SPACY_DATA')
-    else:
-        data_dir = True
-    return English(path=data_dir)
-
-
-@pytest.fixture()
-def doc(nlp):
-    for word in ['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']:
-        _ = nlp.vocab[word]
-    return nlp('Hello, world. Here are two sentences.')
@ -1,172 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-from spacy.attrs import HEAD
-import numpy
-
-
-@pytest.mark.xfail
-def test_example_war_and_peace(nlp):
-    # from spacy.en import English
-    from spacy._doc_examples import download_war_and_peace
-
-    unprocessed_unicode = download_war_and_peace()
-
-    # nlp = English()
-    # TODO: ImportError: No module named _doc_examples
-    doc = nlp(unprocessed_unicode)
-
-
-def test_main_entry_point(nlp):
-    # from spacy.en import English
-    # nlp = English()
-    doc = nlp('Some text.') # Applies tagger, parser, entity
-    doc = nlp('Some text.', parse=False) # Applies tagger and entity, not parser
-    doc = nlp('Some text.', entity=False) # Applies tagger and parser, not entity
-    doc = nlp('Some text.', tag=False) # Does not apply tagger, entity or parser
-    doc = nlp('') # Zero-length tokens, not an error
-    # doc = nlp(b'Some text') <-- Error: need unicode
-    doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
-
-
-@pytest.mark.models
-def test_sentence_spans(nlp):
-    # from spacy.en import English
-    # nlp = English()
-    doc = nlp("This is a sentence. Here's another...")
-    assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
-
-
-@pytest.mark.models
-def test_entity_spans(nlp):
-    # from spacy.en import English
-    # nlp = English()
-    tokens = nlp('Mr. Best flew to New York on Saturday morning.')
-    ents = list(tokens.ents)
-    assert ents[0].label == 346
-    assert ents[0].label_ == 'PERSON'
-    assert ents[0].orth_ == 'Best'
-    assert ents[0].string == ents[0].string
-
-
-@pytest.mark.models
-def test_noun_chunk_spans(nlp):
-    # from spacy.en import English
-    # nlp = English()
-    doc = nlp('The sentence in this example has three noun chunks.')
-    for chunk in doc.noun_chunks:
-        print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_)
-
-    # NP The sentence <-- has
-    # NP this example <-- in
-    # NP three noun chunks <-- has
-
-
-@pytest.mark.models
-def test_count_by(nlp):
-    # from spacy.en import English, attrs
-    # nlp = English()
-    import numpy
-    from spacy import attrs
-    tokens = nlp('apple apple orange banana')
-    assert tokens.count_by(attrs.ORTH) == {3699: 2, 3750: 1, 5965: 1}
-    assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[3699],
-                                                                    [3699],
-                                                                    [3750],
-                                                                    [5965]], dtype=numpy.int32))
-
-
-@pytest.mark.models
-def test_read_bytes(nlp):
-    from spacy.tokens.doc import Doc
-    loc = 'test_serialize.bin'
-    with open(loc, 'wb') as file_:
-        file_.write(nlp(u'This is a document.').to_bytes())
-        file_.write(nlp(u'This is another.').to_bytes())
-    docs = []
-    with open(loc, 'rb') as file_:
-        for byte_string in Doc.read_bytes(file_):
-            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
-    assert len(docs) == 2
-
-
-def test_token_span(doc):
-    span = doc[4:6]
-    token = span[0]
-    assert token.i == 4
-
-
-@pytest.mark.models
-def test_example_i_like_new_york1(nlp):
-    toks = nlp('I like New York in Autumn.')
-
-
-@pytest.fixture
-def toks(nlp):
-    doc = nlp('I like New York in Autumn.')
-    doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
-    return doc
-
-
-def test_example_i_like_new_york2(toks):
-    i, like, new, york, in_, autumn, dot = range(len(toks))
-
-
-@pytest.fixture
-def tok(toks, tok):
-    i, like, new, york, in_, autumn, dot = range(len(toks))
-    return locals()[tok]
-
-
-@pytest.fixture
-def new(toks):
-    return tok(toks, "new")
-
-
-@pytest.fixture
-def york(toks):
-    return tok(toks, "york")
-
-
-@pytest.fixture
-def autumn(toks):
-    return tok(toks, "autumn")
-
-
-@pytest.fixture
-def dot(toks):
-    return tok(toks, "dot")
-
-
-def test_example_i_like_new_york3(toks, new, york):
-    assert toks[new].head.orth_ == 'York'
-    assert toks[york].head.orth_ == 'like'
-
-
-def test_example_i_like_new_york4(toks, new, york):
-    new_york = toks[new:york+1]
-    assert new_york.root.orth_ == 'York'
-
-
-def test_example_i_like_new_york5(toks, autumn, dot):
-    assert toks[autumn].head.orth_ == 'in'
-    assert toks[dot].head.orth_ == 'like'
-    autumn_dot = toks[autumn:]
-    assert autumn_dot.root.orth_ == 'Autumn'
-
-
-def test_navigating_the_parse_tree_lefts(doc):
-    # TODO: where does the span object come from?
-    span = doc[:2]
-    lefts = [span.doc[i] for i in range(0, span.start)
-             if span.doc[i].head in span]
-
-
-def test_navigating_the_parse_tree_rights(doc):
-    span = doc[:2]
-    rights = [span.doc[i] for i in range(span.end, len(span.doc))
-              if span.doc[i].head in span]
-
-
-def test_string_store(doc):
-    string_store = doc.vocab.strings
-    for i, string in enumerate(string_store):
-        assert i == string_store[string]
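The removed `test_read_bytes` above depends on `Doc.to_bytes()` / `Doc.read_bytes()` recovering several documents from a single file. As a hedged illustration of that general pattern only (simple length-prefixed framing, not spaCy's actual wire format):

import struct

def write_blobs(path, blobs):
    # Prefix every blob with its byte length so several can share one file.
    with open(path, 'wb') as file_:
        for blob in blobs:
            file_.write(struct.pack('<I', len(blob)))
            file_.write(blob)

def read_blobs(path):
    # Yield the blobs back one by one, mirroring the Doc.read_bytes() usage.
    with open(path, 'rb') as file_:
        while True:
            header = file_.read(4)
            if not header:
                break
            length, = struct.unpack('<I', header)
            yield file_.read(length)

write_blobs('example.bin', [b'This is a document.', b'This is another.'])
assert len(list(read_blobs('example.bin'))) == 2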
@ -1,180 +0,0 @@
-from __future__ import unicode_literals
-import pytest
-import spacy
-import os
-
-
-try:
-    xrange
-except NameError:
-    xrange = range
-
-
-@pytest.fixture()
-def token(doc):
-    return doc[0]
-
-
-@pytest.mark.models
-def test_load_resources_and_process_text():
-    from spacy.en import English
-    nlp = English()
-    doc = nlp(u'Hello, world. Here are two sentences.')
-
-
-@pytest.mark.models
-def test_get_tokens_and_sentences(doc):
-    token = doc[0]
-    sentence = next(doc.sents)
-    assert token is sentence[0]
-    assert sentence.text == 'Hello, world.'
-
-
-@pytest.mark.models
-def test_use_integer_ids_for_any_strings(nlp, token):
-    hello_id = nlp.vocab.strings['Hello']
-    hello_str = nlp.vocab.strings[hello_id]
-
-    assert token.orth == hello_id == 3125
-    assert token.orth_ == hello_str == 'Hello'
-
-
-def test_get_and_set_string_views_and_flags(nlp, token):
-    assert token.shape_ == 'Xxxxx'
-    for lexeme in nlp.vocab:
-        if lexeme.is_alpha:
-            lexeme.shape_ = 'W'
-        elif lexeme.is_digit:
-            lexeme.shape_ = 'D'
-        elif lexeme.is_punct:
-            lexeme.shape_ = 'P'
-        else:
-            lexeme.shape_ = 'M'
-    assert token.shape_ == 'W'
-
-
-def test_export_to_numpy_arrays(nlp, doc):
-    from spacy.attrs import ORTH, LIKE_URL, IS_OOV
-
-    attr_ids = [ORTH, LIKE_URL, IS_OOV]
-    doc_array = doc.to_array(attr_ids)
-    assert doc_array.shape == (len(doc), len(attr_ids))
-    assert doc[0].orth == doc_array[0, 0]
-    assert doc[1].orth == doc_array[1, 0]
-    assert doc[0].like_url == doc_array[0, 1]
-    assert list(doc_array[:, 1]) == [t.like_url for t in doc]
-
-
-@pytest.mark.models
-def test_word_vectors(nlp):
-    doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
-
-    apples = doc[0]
-    oranges = doc[2]
-    boots = doc[6]
-    hippos = doc[8]
-
-    assert apples.similarity(oranges) > boots.similarity(hippos)
-
-
-@pytest.mark.models
-def test_part_of_speech_tags(nlp):
-    from spacy.parts_of_speech import ADV
-
-    def is_adverb(token):
-        return token.pos == spacy.parts_of_speech.ADV
-
-    # These are data-specific, so no constants are provided. You have to look
-    # up the IDs from the StringStore.
-    NNS = nlp.vocab.strings['NNS']
-    NNPS = nlp.vocab.strings['NNPS']
-    def is_plural_noun(token):
-        return token.tag == NNS or token.tag == NNPS
-
-    def print_coarse_pos(token):
-        print(token.pos_)
-
-    def print_fine_pos(token):
-        print(token.tag_)
-
-
-@pytest.mark.models
-def test_syntactic_dependencies():
-    def dependency_labels_to_root(token):
-        '''Walk up the syntactic tree, collecting the arc labels.'''
-        dep_labels = []
-        while token.head is not token:
-            dep_labels.append(token.dep)
-            token = token.head
-        return dep_labels
-
-
-@pytest.mark.models
-def test_named_entities():
-    def iter_products(docs):
-        for doc in docs:
-            for ent in doc.ents:
-                if ent.label_ == 'PRODUCT':
-                    yield ent
-
-    def word_is_in_entity(word):
-        return word.ent_type != 0
-
-    def count_parent_verb_by_person(docs):
-        counts = defaultdict(defaultdict(int))
-        for doc in docs:
-            for ent in doc.ents:
-                if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
-                    counts[ent.orth_][ent.root.head.lemma_] += 1
-        return counts
-
-
-def test_calculate_inline_mark_up_on_original_string():
-    def put_spans_around_tokens(doc, get_classes):
-        '''Given some function to compute class names, put each token in a
-        span element, with the appropriate classes computed.
-
-        All whitespace is preserved, outside of the spans. (Yes, I know HTML
-        won't display it. But the point is no information is lost, so you can
-        calculate what you need, e.g. <br /> tags, <p> tags, etc.)
-        '''
-        output = []
-        template = '<span classes="{classes}">{word}</span>{space}'
-        for token in doc:
-            if token.is_space:
-                output.append(token.orth_)
-            else:
-                output.append(
-                    template.format(
-                        classes=' '.join(get_classes(token)),
-                        word=token.orth_,
-                        space=token.whitespace_))
-        string = ''.join(output)
-        string = string.replace('\n', '')
-        string = string.replace('\t', ' ')
-        return string
-
-
-@pytest.mark.models
-def test_efficient_binary_serialization(doc):
-    from spacy.tokens.doc import Doc
-
-    byte_string = doc.to_bytes()
-    open('moby_dick.bin', 'wb').write(byte_string)
-
-    nlp = spacy.en.English()
-    for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
-        doc = Doc(nlp.vocab)
-        doc.from_bytes(byte_string)
-
-
-@pytest.mark.models
-def test_multithreading(nlp):
-    texts = [u'One document.', u'...', u'Lots of documents']
-    # .pipe streams input, and produces streaming output
-    iter_texts = (texts[i % 3] for i in xrange(100000000))
-    for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
-        assert doc.is_parsed
-        if i == 100:
-            break
@ -94,8 +94,13 @@ def read_regex(path):


 def compile_prefix_regex(entries):
-    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
-    return re.compile(expression)
+    if '(' in entries:
+        # Handle deprecated data
+        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
+        return re.compile(expression)
+    else:
+        expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
+        return re.compile(expression)


 def compile_suffix_regex(entries):
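The branch added above only escapes the prefix entries when the data looks like the deprecated format — presumably because the older prefix lists contained the bare '(' character as an entry — and otherwise treats the entries as ready-made regular expressions. A standalone sketch of the same idea, with a made-up entry list:

import re

def compile_prefix_regex(entries):
    # Deprecated data: plain strings that must be escaped literally.
    if '(' in entries:
        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    # Newer data: entries are already regular expressions.
    else:
        expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
    return re.compile(expression)

# Example with hypothetical entries: '(' triggers the escaping path, so the
# compiled pattern matches a literal opening bracket at the start of a string.
assert compile_prefix_regex(['(', '"', "'"]).match('("Hello')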
@ -22,7 +22,8 @@
         "twitter": "spacy_io",
         "github": "explosion",
         "reddit": "spacynlp",
-        "codepen": "explosion"
+        "codepen": "explosion",
+        "gitter": "explosion/spaCy"
     },

     "NAVIGATION": {
@ -53,7 +54,7 @@
         }
     },

-    "V_CSS": "1.10",
+    "V_CSS": "1.14",
     "V_JS": "1.0",
     "DEFAULT_SYNTAX" : "python",
     "ANALYTICS": "UA-58931649-1",
@ -1,6 +1,7 @@
 //- 💫 MIXINS > BASE

 //- Aside wrapper
+    label - [string] aside label

 mixin aside-wrapper(label)
     aside.c-aside
@ -21,6 +22,10 @@ mixin date(input)


 //- SVG from map
+    file   - [string] SVG file name in /assets/img/
+    name   - [string] SVG symbol id
+    width  - [integer] width in px
+    height - [integer] height in px (default: same as width)

 mixin svg(file, name, width, height)
     svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
@ -28,19 +33,23 @@ mixin svg(file, name, width, height)


 //- Icon
+    name - [string] icon name, should be SVG symbol ID
+    size - [integer] icon width and height (default: 20)

 mixin icon(name, size)
-    +svg("icons", "icon-" + name, size || 20).o-icon&attributes(attributes)
+    +svg("icons", name, size || 20).o-icon&attributes(attributes)


 //- Pro/Con/Neutral icon
+    icon - [string] "pro", "con" or "neutral" (default: "neutral")

 mixin procon(icon)
-    - colors = { pro: "green", con: "red" }
+    - colors = { pro: "green", con: "red", neutral: "yellow" }
     +icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)


 //- Headlines Helper Mixin
+    level - [integer] 1, 2, 3, 4, or 5

 mixin headline(level)
     if level == 1
@ -65,6 +74,7 @@ mixin headline(level)


 //- Permalink rendering
+    id - [string] permalink ID used for link anchor

 mixin permalink(id)
     if id
@ -77,6 +87,7 @@ mixin permalink(id)


 //- Terminal-style code window
+    label - [string] title displayed in top bar of terminal window

 mixin terminal(label)
     .x-terminal
@ -87,6 +98,18 @@ mixin terminal(label)
         block


+//- Gitter chat button and widget
+    button - [string] text shown on button
+    label  - [string] title of chat window (default: same as button)

+mixin gitter(button, label)
+    aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))

+        button.js-gitter-button.c-chat__button.u-text-small
+            +icon("chat").o-icon--inline
+            !=button


 //- Logo

 mixin logo()
|
||||||
|
|
|
@ -44,7 +44,7 @@ mixin api(path)
     +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block
         block

-        | #[+icon("book", 18).o-icon--inline.u-help.u-color-subtle]
+        | #[+icon("book", 18).o-icon--inline.u-color-subtle]


 //- Aside for text
|
||||||
|
|
|
@ -24,4 +24,6 @@ main.o-main.o-main--sidebar.o-main--aside
         .o-inline-list
             +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)]

+    +gitter("spaCy chat")

     include _footer

website/_includes/_scripts.jade (new file, 23 lines)
|
@ -0,0 +1,23 @@
+//- 💫 INCLUDES > SCRIPTS
+
+script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
+script(src="/assets/js/prism.js", type="text/javascript")
+
+if SECTION == "docs"
+    script.
+        ((window.gitter = {}).chat = {}).options = {
+            useStyles: false,
+            activationElement: '.js-gitter-button',
+            targetElement: '.js-gitter',
+            room: '!{SOCIAL.gitter}'
+        };
+
+    script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
+
+if environment == "deploy"
+    script
+        | window.ga=window.ga||function(){
+        | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
+        | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
+
+    script(async src="https://www.google-analytics.com/analytics.js")
|
|
@ -52,13 +52,4 @@ html(lang="en")
         main!=yield
         include _includes/_footer

-        script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
-        script(src="/assets/js/prism.js", type="text/javascript")
-
-        if environment == "deploy"
-            script
-                | window.ga=window.ga||function(){
-                | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
-                | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
-
-            script(async src="https://www.google-analytics.com/analytics.js")
+        include _includes/_scripts
|
|
||||||
|
|
|
@ -6,36 +6,36 @@
|
||||||
font-family: "Source Sans Pro"
|
font-family: "Source Sans Pro"
|
||||||
font-style: normal
|
font-style: normal
|
||||||
font-weight: 400
|
font-weight: 400
|
||||||
src: url("../fonts/sourcesanspro-regular.eot")
|
src: url("/assets/fonts/sourcesanspro-regular.eot")
|
||||||
src: url("../fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-regular.woff2") format("woff2"), url("../fonts/sourcesanspro-regular.woff") format("woff"), url("../fonts/sourcesanspro-regular.ttf") format("truetype"), url("../fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg")
|
src: url("/assets/fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-regular.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-regular.woff") format("woff"), url("/assets/fonts/sourcesanspro-regular.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg")
|
||||||
|
|
||||||
@font-face
|
@font-face
|
||||||
font-family: "Source Sans Pro"
|
font-family: "Source Sans Pro"
|
||||||
font-style: italic
|
font-style: italic
|
||||||
font-weight: 400
|
font-weight: 400
|
||||||
src: url("../fonts/sourcesanspro-italic.eot")
|
src: url("/assets/fonts/sourcesanspro-italic.eot")
|
||||||
src: url("../fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-italic.woff2") format("woff2"), url("../fonts/sourcesanspro-italic.woff") format("woff"), url("../fonts/sourcesanspro-italic.ttf") format("truetype"), url("../fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg")
|
src: url("/assets/fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-italic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-italic.woff") format("woff"), url("/assets/fonts/sourcesanspro-italic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg")
|
||||||
|
|
||||||
@font-face
|
@font-face
|
||||||
font-family: "Source Sans Pro"
|
font-family: "Source Sans Pro"
|
||||||
font-style: normal
|
font-style: normal
|
||||||
font-weight: 700
|
font-weight: 700
|
||||||
src: url("../fonts/sourcesanspro-bold.eot")
|
src: url("/assets/fonts/sourcesanspro-bold.eot")
|
||||||
src: url("../fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bold.woff2") format("woff2"), url("../fonts/sourcesanspro-bold.woff") format("woff"), url("../fonts/sourcesanspro-bold.ttf") format("truetype"), url("../fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg")
|
src: url("/assets/fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bold.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bold.woff") format("woff"), url("/assets/fonts/sourcesanspro-bold.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg")
|
||||||
|
|
||||||
@font-face
|
@font-face
|
||||||
font-family: "Source Sans Pro"
|
font-family: "Source Sans Pro"
|
||||||
font-style: italic
|
font-style: italic
|
||||||
font-weight: 700
|
font-weight: 700
|
||||||
src: url("../fonts/sourcesanspro-bolditalic.eot")
|
src: url("/assets/fonts/sourcesanspro-bolditalic.eot")
|
||||||
src: url("../fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("../fonts/sourcesanspro-bolditalic.woff") format("woff"), url("../fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("../fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg")
|
src: url("/assets/fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bolditalic.woff") format("woff"), url("/assets/fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg")
|
||||||
|
|
||||||
|
|
||||||
// Source Code Pro
|
// Source Code Pro
|
||||||
|
|
||||||
@font-face
|
@font-face
|
||||||
font-family: "Source Code Pro"
|
font-family: "Source Code Pro"
|
||||||
font-style: normal
|
font-style: normal
|
||||||
font-weight: 600
|
font-weight: 600
|
||||||
src: url("../fonts/sourcecodepro-semibold.eot")
|
src: url("/assets/fonts/sourcecodepro-semibold.eot")
|
||||||
src: url("../fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcecodepro-semibold.woff") format("woff"), url("../fonts/sourcecodepro-semibold.ttf") format("truetype"), url("../fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg")
|
src: url("/assets/fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcecodepro-semibold.woff") format("woff"), url("/assets/fonts/sourcecodepro-semibold.ttf") format("truetype"), url("/assets/fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg")
|
||||||
|
|
|
@ -60,7 +60,7 @@
     background: $color-back
     border-radius: 2px
     border: 1px solid $color-subtle
-    padding: 3.5% 2.5%
+    padding: 3rem 2.5%

 //- Icons
|
||||||
|
|
||||||
|
|
|
@ -141,12 +141,6 @@
     background: $pattern


-//- Cursors
-
-.u-help
-    cursor: help
-
-
 //- Hidden elements

 .u-hidden
|
||||||
|
|
website/assets/css/_components/_chat.sass (new file, 100 lines)
|
@ -0,0 +1,100 @@
|
||||||
|
//- 💫 CSS > COMPONENTS > CHAT
|
||||||
|
|
||||||
|
.c-chat
|
||||||
|
@include position(fixed, top, left, 0, 60%)
|
||||||
|
bottom: 0
|
||||||
|
right: 0
|
||||||
|
display: flex
|
||||||
|
flex-flow: column nowrap
|
||||||
|
background: $color-back
|
||||||
|
transition: transform 0.3s cubic-bezier(0.16, 0.22, 0.22, 1.7)
|
||||||
|
box-shadow: -0.25rem 0 1rem 0 rgba($color-front, 0.25)
|
||||||
|
z-index: 100
|
||||||
|
|
||||||
|
@include breakpoint(min, md)
|
||||||
|
left: calc(100% - #{$aside-width} - #{$aside-padding})
|
||||||
|
|
||||||
|
@include breakpoint(max, sm)
|
||||||
|
left: 50%
|
||||||
|
|
||||||
|
@include breakpoint(max, xs)
|
||||||
|
left: 0
|
||||||
|
|
||||||
|
&.is-collapsed:not(.is-loading)
|
||||||
|
transform: translateX(110%)
|
||||||
|
|
||||||
|
&:before
|
||||||
|
@include position(absolute, top, left, 1rem, 2rem)
|
||||||
|
content: attr(data-title)
|
||||||
|
font: bold 1.4rem $font-code
|
||||||
|
text-transform: uppercase
|
||||||
|
color: $color-back
|
||||||
|
|
||||||
|
&:after
|
||||||
|
@include position(absolute, top, left, 0, 100%)
|
||||||
|
content: ""
|
||||||
|
z-index: -1
|
||||||
|
bottom: 0
|
||||||
|
right: -100%
|
||||||
|
background: $color-back
|
||||||
|
|
||||||
|
& > iframe
|
||||||
|
width: 100%
|
||||||
|
flex: 1 1 calc(100% - #{$nav-height})
|
||||||
|
border: 0
|
||||||
|
|
||||||
|
.gitter-chat-embed-loading-wrapper
|
||||||
|
@include position(absolute, top, left, 0, 0)
|
||||||
|
right: 0
|
||||||
|
bottom: 0
|
||||||
|
display: none
|
||||||
|
justify-content: center
|
||||||
|
align-items: center
|
||||||
|
|
||||||
|
.is-loading &
|
||||||
|
display: flex
|
||||||
|
|
||||||
|
.gitter-chat-embed-action-bar,
|
||||||
|
.gitter-chat-embed-action-bar-item
|
||||||
|
display: flex
|
||||||
|
|
||||||
|
.gitter-chat-embed-action-bar
|
||||||
|
align-items: center
|
||||||
|
justify-content: flex-end
|
||||||
|
background: $color-theme
|
||||||
|
padding: 0 1rem 0 2rem
|
||||||
|
flex: 0 0 $nav-height
|
||||||
|
|
||||||
|
.gitter-chat-embed-action-bar-item
|
||||||
|
@include size(40px)
|
||||||
|
padding: 0
|
||||||
|
opacity: 0.75
|
||||||
|
background-position: 50%
|
||||||
|
background-repeat: no-repeat
|
||||||
|
background-size: 22px 22px
|
||||||
|
border: 0
|
||||||
|
cursor: pointer
|
||||||
|
transition: all 0.2s ease
|
||||||
|
|
||||||
|
&:focus,
|
||||||
|
&:hover
|
||||||
|
opacity: 1
|
||||||
|
|
||||||
|
&.gitter-chat-embed-action-bar-item-pop-out
|
||||||
|
background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyMCIgaGVpZ2h0PSIyMCIgdmlld0JveD0iMCAwIDIwIDIwIj48cGF0aCBmaWxsPSIjZmZmIiBkPSJNMTYgMmgtOC4wMjFjLTEuMDk5IDAtMS45NzkgMC44OC0xLjk3OSAxLjk4djguMDIwYzAgMS4xIDAuOSAyIDIgMmg4YzEuMSAwIDItMC45IDItMnYtOGMwLTEuMS0wLjktMi0yLTJ6TTE2IDEyaC04di04aDh2OHpNNCAxMGgtMnY2YzAgMS4xIDAuOSAyIDIgMmg2di0yaC02di02eiI+PC9wYXRoPjwvc3ZnPg==)
|
||||||
|
margin-right: -4px
|
||||||
|
|
||||||
|
&.gitter-chat-embed-action-bar-item-collapse-chat
|
||||||
|
background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0Ij48cGF0aCBmaWxsPSIjZmZmIiBkPSJNMTguOTg0IDYuNDIybC01LjU3OCA1LjU3OCA1LjU3OCA1LjU3OC0xLjQwNiAxLjQwNi01LjU3OC01LjU3OC01LjU3OCA1LjU3OC0xLjQwNi0xLjQwNiA1LjU3OC01LjU3OC01LjU3OC01LjU3OCAxLjQwNi0xLjQwNiA1LjU3OCA1LjU3OCA1LjU3OC01LjU3OHoiPjwvcGF0aD48L3N2Zz4=)
|
||||||
|
|
||||||
|
.c-chat__button
|
||||||
|
@include position(fixed, bottom, right, 0, 2rem)
|
||||||
|
padding: 1rem 1.5rem
|
||||||
|
background: $color-front
|
||||||
|
color: $color-back
|
||||||
|
border-top-left-radius: 4px
|
||||||
|
border-top-right-radius: 4px
|
||||||
|
z-index: 20
|
||||||
|
border-color: $color-theme
|
||||||
|
border-style: solid
|
||||||
|
border-width: 1px 1px 0 1px
|
|
@ -24,6 +24,7 @@ $theme: blue !default

 @import _components/asides
 @import _components/buttons
+@import _components/chat
 @import _components/code
 @import _components/landing
 @import _components/lists
|
||||||
|
|
|
@ -64,5 +64,6 @@
|
||||||
<symbol id="matt-signature" viewBox="0 0 500 250">
|
<symbol id="matt-signature" viewBox="0 0 500 250">
|
||||||
<title>matt-signature</title>
|
<title>matt-signature</title>
|
||||||
<path fill="currentColor" d="M18.6 207c-.3-18.8-.8-37.5-1.4-56.2-.6-18.7-1-37.5-1-56.2v-7.2c0-3.5 0-7 .2-11v-18c.8-2.7 1.8-5 3-6.5 1.6-2 3.6-3 6.4-3 3 0 5.4 1 7.6 2 2.2 2 4 4 5.3 6l36.6 71 1.8 3c1 1 2 3 3 3h1l1 1 1-3 22-76c2-3 3-5 4-8l2-9c1-3 2-6 4-8 1-3 4-5 7-7h2c5 0 8 1 10 4 3 2 4 5 5 9 1 3 2 7 1 12v11l1 7c0 3 0 7 1 12 0 4 1 9 1 14l1 14.2 1 12 .6 6v1l1 7.5 1 11.6 1.4 12 1.4 8 1 4 1.7 5.5 1.7 6c.7 1.7 1 3 1.5 3.6-.5 4-1.5 7-3 9-1 2-4 3-8 3h-6l-3-3c-1-1.4-2-2.3-2-3l-4-14-7.6-58V88c0-3.5-1-7-2-10l-2 1.7-18 74v6c0 2-.2 4-1 6 0 2-1 3.5-3 5-1 1.3-3 2-5 2.2-1 0-2 0-3-1l-3.4-2-3-3c-1-1-1.7-2-2-3l-35-52-5.3-10.6v22c0 10.2.2 20.3.6 30.2.4 10 .6 20 .6 30.2v22c0 2-1 4-3 5.4s-3 3-5 3c-3 0-5 0-7-1-1-1-3-3-4-5zm205-63.2c-1.6 2.7-3.4 6-5.3 9.8l-6.2 12.2c-2 4.3-4 8.6-7 13-2 4.2-5 8.2-8 11.7s-5 6.6-9 9c-3 2.5-6 4-9 4.4-1 0-3-1-4-1l-5-2c-1-1-3-2-4-3s-1-3-1-5c1-18 2-33 4-47s6-27 11-38 12-20 20-27 18-12 29-15l2-1h2c5 0 9 2 11 7s4 12 5 23c1 10 2 24 2 40 1 16 2 36 3 59l1 4v5c0 2.6-1 4.5-2 6s-3 2-5 2c-5 0-8-1.7-10-4s-3-6.6-4-11v-4l-1-9s-1-6.7-1-10l-1-8.5v-1l-.2-6-1-7-.5-8.6-1-1zM218 93.5c-4.7 3.4-9.2 8-13.6 13.7-4.4 5.8-7.5 11.3-9.4 16.8-.8 2.5-1.8 6-2.8 10.4-1 4.4-2 8.8-2.7 13l-2 12-.7 7c.2 0 .4-.2.6-.5l.6-1c10.5-10 18-21 22.2-33 4.6-12 7-25 7.7-39zm72 47c-2.3 0-4.4.6-6.2 1.8-2 1.2-4 1.8-6.6 1.8h-5.4c-.7-1-1.4-1-2.3-2l-2.5-2c-.8 0-1.6-1-2.2-2-.6-1-1-2-1-3 0-2 1-4 3-6 2-1 4.5-3 7.2-4l8.3-3s5-2 6.7-3v-11c0-12-.6-25-1.8-38-1.2-12-1.8-25-1.8-37 0-3 .8-6 2.5-7 1-1 4-1 6-1 3 0 6 1 7 3s2 4 3 7c0 3 1 6 1 9v20l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2.3 9-3.4 14-3.4 3 0 6 1 7 3.5s3 5 3 8c0 2-1 4-3 5l-6 3-46 17-1.5 1s-1 0-1.5 1v8c0 6 0 12 .5 18s1 12.3 2 18.3l3 15c1 5 1.4 10 1.4 15 0 1.4-.6 3.5-1.6 6s-2 4-4.7 4c-5 0-8.7-1.6-11.6-4-3-3-4.3-6.6-4.6-11l-2.2-29-2.7-30h-1zm112 0c-2.4 0-4.5.6-6.3 1.8-2 1.2-4 1.8-6.6 1.8h-5c0-1-1-1-2-2l-2-2c-1 0-1-1-2-2 0-1-1-2-1-3 0-2 1-4 3-6 2-1 5-3 7-4l8-3s5-2 7-3v-11c0-12 0-25-2-38-1-12-1-25-1-37 0-3 1-6 3-7s4-1 7-1c4 0 6 1 8 3s3 4 3 7c1 3 1 6 1 9s0 6 1 8v11l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2 9-4 14-4 4 0 6 1 8 4s3 5 3 8c0 2-1 4-2 5l-5.3 3-49 13.8-1.5 1s-1 .5-1.5 1V157l1 18.3c0 5 1 10 2 15s1 10 1 15c0 1.5-1 3.6-2 6s-3 4-5 4c-5 0-9-1.5-12-4.2s-5-6-5-11l-3-28.3-3-30.3h-1z"/>
|
<path fill="currentColor" d="M18.6 207c-.3-18.8-.8-37.5-1.4-56.2-.6-18.7-1-37.5-1-56.2v-7.2c0-3.5 0-7 .2-11v-18c.8-2.7 1.8-5 3-6.5 1.6-2 3.6-3 6.4-3 3 0 5.4 1 7.6 2 2.2 2 4 4 5.3 6l36.6 71 1.8 3c1 1 2 3 3 3h1l1 1 1-3 22-76c2-3 3-5 4-8l2-9c1-3 2-6 4-8 1-3 4-5 7-7h2c5 0 8 1 10 4 3 2 4 5 5 9 1 3 2 7 1 12v11l1 7c0 3 0 7 1 12 0 4 1 9 1 14l1 14.2 1 12 .6 6v1l1 7.5 1 11.6 1.4 12 1.4 8 1 4 1.7 5.5 1.7 6c.7 1.7 1 3 1.5 3.6-.5 4-1.5 7-3 9-1 2-4 3-8 3h-6l-3-3c-1-1.4-2-2.3-2-3l-4-14-7.6-58V88c0-3.5-1-7-2-10l-2 1.7-18 74v6c0 2-.2 4-1 6 0 2-1 3.5-3 5-1 1.3-3 2-5 2.2-1 0-2 0-3-1l-3.4-2-3-3c-1-1-1.7-2-2-3l-35-52-5.3-10.6v22c0 10.2.2 20.3.6 30.2.4 10 .6 20 .6 30.2v22c0 2-1 4-3 5.4s-3 3-5 3c-3 0-5 0-7-1-1-1-3-3-4-5zm205-63.2c-1.6 2.7-3.4 6-5.3 9.8l-6.2 12.2c-2 4.3-4 8.6-7 13-2 4.2-5 8.2-8 11.7s-5 6.6-9 9c-3 2.5-6 4-9 4.4-1 0-3-1-4-1l-5-2c-1-1-3-2-4-3s-1-3-1-5c1-18 2-33 4-47s6-27 11-38 12-20 20-27 18-12 29-15l2-1h2c5 0 9 2 11 7s4 12 5 23c1 10 2 24 2 40 1 16 2 36 3 59l1 4v5c0 2.6-1 4.5-2 6s-3 2-5 2c-5 0-8-1.7-10-4s-3-6.6-4-11v-4l-1-9s-1-6.7-1-10l-1-8.5v-1l-.2-6-1-7-.5-8.6-1-1zM218 93.5c-4.7 3.4-9.2 8-13.6 13.7-4.4 5.8-7.5 11.3-9.4 16.8-.8 2.5-1.8 6-2.8 10.4-1 4.4-2 8.8-2.7 13l-2 12-.7 7c.2 0 .4-.2.6-.5l.6-1c10.5-10 18-21 22.2-33 4.6-12 7-25 7.7-39zm72 47c-2.3 0-4.4.6-6.2 1.8-2 1.2-4 1.8-6.6 1.8h-5.4c-.7-1-1.4-1-2.3-2l-2.5-2c-.8 0-1.6-1-2.2-2-.6-1-1-2-1-3 0-2 1-4 3-6 2-1 4.5-3 7.2-4l8.3-3s5-2 6.7-3v-11c0-12-.6-25-1.8-38-1.2-12-1.8-25-1.8-37 0-3 .8-6 2.5-7 1-1 4-1 6-1 3 0 6 1 7 3s2 4 3 7c0 3 1 6 1 9v20l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2.3 9-3.4 14-3.4 3 0 6 1 7 3.5s3 5 3 8c0 2-1 4-3 5l-6 3-46 17-1.5 1s-1 0-1.5 1v8c0 6 0 12 .5 18s1 12.3 2 18.3l3 15c1 5 1.4 10 1.4 15 0 1.4-.6 3.5-1.6 6s-2 4-4.7 4c-5 0-8.7-1.6-11.6-4-3-3-4.3-6.6-4.6-11l-2.2-29-2.7-30h-1zm112 0c-2.4 0-4.5.6-6.3 1.8-2 1.2-4 1.8-6.6 1.8h-5c0-1-1-1-2-2l-2-2c-1 0-1-1-2-2 0-1-1-2-1-3 0-2 1-4 3-6 2-1 5-3 7-4l8-3s5-2 7-3v-11c0-12 0-25-2-38-1-12-1-25-1-37 0-3 1-6 3-7s4-1 7-1c4 0 6 1 8 3s3 4 3 7c1 3 1 6 1 9s0 6 1 8v11l1 18 1 18 1 12 4-1 6-2 6-2 4-1 14-6c4-2 9-4 14-4 4 0 6 1 8 4s3 5 3 8c0 2-1 4-2 5l-5.3 3-49 13.8-1.5 1s-1 .5-1.5 1V157l1 18.3c0 5 1 10 2 15s1 10 1 15c0 1.5-1 3.6-2 6s-3 4-5 4c-5 0-9-1.5-12-4.2s-5-6-5-11l-3-28.3-3-30.3h-1z"/>
|
||||||
|
</symbol>
|
||||||
</defs>
|
</defs>
|
||||||
</svg>
|
</svg>
|
||||||
|
|
|
@ -1,32 +1,28 @@
|
||||||
<svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
<svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||||
<defs>
|
<defs>
|
||||||
<symbol id="icon-github" viewBox="0 0 27 32">
|
<symbol id="github" viewBox="0 0 27 32">
|
||||||
<title>github</title>
|
<path d="M13.714 2.286q3.732 0 6.884 1.839t4.991 4.991 1.839 6.884q0 4.482-2.616 8.063t-6.759 4.955q-0.482 0.089-0.714-0.125t-0.232-0.536q0-0.054 0.009-1.366t0.009-2.402q0-1.732-0.929-2.536 1.018-0.107 1.83-0.321t1.679-0.696 1.446-1.188 0.946-1.875 0.366-2.688q0-2.125-1.411-3.679 0.661-1.625-0.143-3.643-0.5-0.161-1.446 0.196t-1.643 0.786l-0.679 0.429q-1.661-0.464-3.429-0.464t-3.429 0.464q-0.286-0.196-0.759-0.482t-1.491-0.688-1.518-0.241q-0.804 2.018-0.143 3.643-1.411 1.554-1.411 3.679 0 1.518 0.366 2.679t0.938 1.875 1.438 1.196 1.679 0.696 1.83 0.321q-0.696 0.643-0.875 1.839-0.375 0.179-0.804 0.268t-1.018 0.089-1.17-0.384-0.991-1.116q-0.339-0.571-0.866-0.929t-0.884-0.429l-0.357-0.054q-0.375 0-0.518 0.080t-0.089 0.205 0.161 0.25 0.232 0.214l0.125 0.089q0.393 0.179 0.777 0.679t0.563 0.911l0.179 0.411q0.232 0.679 0.786 1.098t1.196 0.536 1.241 0.125 0.991-0.063l0.411-0.071q0 0.679 0.009 1.58t0.009 0.973q0 0.321-0.232 0.536t-0.714 0.125q-4.143-1.375-6.759-4.955t-2.616-8.063q0-3.732 1.839-6.884t4.991-4.991 6.884-1.839zM5.196 21.982q0.054-0.125-0.125-0.214-0.179-0.054-0.232 0.036-0.054 0.125 0.125 0.214 0.161 0.107 0.232-0.036zM5.75 22.589q0.125-0.089-0.036-0.286-0.179-0.161-0.286-0.054-0.125 0.089 0.036 0.286 0.179 0.179 0.286 0.054zM6.286 23.393q0.161-0.125 0-0.339-0.143-0.232-0.304-0.107-0.161 0.089 0 0.321t0.304 0.125zM7.036 24.143q0.143-0.143-0.071-0.339-0.214-0.214-0.357-0.054-0.161 0.143 0.071 0.339 0.214 0.214 0.357 0.054zM8.054 24.589q0.054-0.196-0.232-0.286-0.268-0.071-0.339 0.125t0.232 0.268q0.268 0.107 0.339-0.107zM9.179 24.679q0-0.232-0.304-0.196-0.286 0-0.286 0.196 0 0.232 0.304 0.196 0.286 0 0.286-0.196zM10.214 24.5q-0.036-0.196-0.321-0.161-0.286 0.054-0.25 0.268t0.321 0.143 0.25-0.25z"></path>
|
||||||
<path class="path1" d="M13.714 2.286q3.732 0 6.884 1.839t4.991 4.991 1.839 6.884q0 4.482-2.616 8.063t-6.759 4.955q-0.482 0.089-0.714-0.125t-0.232-0.536q0-0.054 0.009-1.366t0.009-2.402q0-1.732-0.929-2.536 1.018-0.107 1.83-0.321t1.679-0.696 1.446-1.188 0.946-1.875 0.366-2.688q0-2.125-1.411-3.679 0.661-1.625-0.143-3.643-0.5-0.161-1.446 0.196t-1.643 0.786l-0.679 0.429q-1.661-0.464-3.429-0.464t-3.429 0.464q-0.286-0.196-0.759-0.482t-1.491-0.688-1.518-0.241q-0.804 2.018-0.143 3.643-1.411 1.554-1.411 3.679 0 1.518 0.366 2.679t0.938 1.875 1.438 1.196 1.679 0.696 1.83 0.321q-0.696 0.643-0.875 1.839-0.375 0.179-0.804 0.268t-1.018 0.089-1.17-0.384-0.991-1.116q-0.339-0.571-0.866-0.929t-0.884-0.429l-0.357-0.054q-0.375 0-0.518 0.080t-0.089 0.205 0.161 0.25 0.232 0.214l0.125 0.089q0.393 0.179 0.777 0.679t0.563 0.911l0.179 0.411q0.232 0.679 0.786 1.098t1.196 0.536 1.241 0.125 0.991-0.063l0.411-0.071q0 0.679 0.009 1.58t0.009 0.973q0 0.321-0.232 0.536t-0.714 0.125q-4.143-1.375-6.759-4.955t-2.616-8.063q0-3.732 1.839-6.884t4.991-4.991 6.884-1.839zM5.196 21.982q0.054-0.125-0.125-0.214-0.179-0.054-0.232 0.036-0.054 0.125 0.125 0.214 0.161 0.107 0.232-0.036zM5.75 22.589q0.125-0.089-0.036-0.286-0.179-0.161-0.286-0.054-0.125 0.089 0.036 0.286 0.179 0.179 0.286 0.054zM6.286 23.393q0.161-0.125 0-0.339-0.143-0.232-0.304-0.107-0.161 0.089 0 0.321t0.304 0.125zM7.036 24.143q0.143-0.143-0.071-0.339-0.214-0.214-0.357-0.054-0.161 0.143 0.071 0.339 0.214 0.214 0.357 0.054zM8.054 24.589q0.054-0.196-0.232-0.286-0.268-0.071-0.339 0.125t0.232 0.268q0.268 0.107 0.339-0.107zM9.179 24.679q0-0.232-0.304-0.196-0.286 0-0.286 0.196 0 0.232 0.304 0.196 0.286 0 0.286-0.196zM10.214 24.5q-0.036-0.196-0.321-0.161-0.286 0.054-0.25 0.268t0.321 0.143 0.25-0.25z"></path>
|
|
||||||
</symbol>
|
</symbol>
|
||||||
<symbol id="icon-code" viewBox="0 0 20 20">
|
<symbol id="code" viewBox="0 0 20 20">
|
||||||
<title>code</title>
|
<path d="M5.719 14.75c-0.236 0-0.474-0.083-0.664-0.252l-5.060-4.498 5.341-4.748c0.412-0.365 1.044-0.33 1.411 0.083s0.33 1.045-0.083 1.412l-3.659 3.253 3.378 3.002c0.413 0.367 0.45 0.999 0.083 1.412-0.197 0.223-0.472 0.336-0.747 0.336zM14.664 14.748l5.341-4.748-5.060-4.498c-0.413-0.367-1.045-0.33-1.411 0.083s-0.33 1.045 0.083 1.412l3.378 3.003-3.659 3.252c-0.413 0.367-0.45 0.999-0.083 1.412 0.197 0.223 0.472 0.336 0.747 0.336 0.236 0 0.474-0.083 0.664-0.252zM9.986 16.165l2-12c0.091-0.545-0.277-1.060-0.822-1.151-0.547-0.092-1.061 0.277-1.15 0.822l-2 12c-0.091 0.545 0.277 1.060 0.822 1.151 0.056 0.009 0.11 0.013 0.165 0.013 0.48 0 0.904-0.347 0.985-0.835z"></path>
|
||||||
<path class="path1" d="M5.719 14.75c-0.236 0-0.474-0.083-0.664-0.252l-5.060-4.498 5.341-4.748c0.412-0.365 1.044-0.33 1.411 0.083s0.33 1.045-0.083 1.412l-3.659 3.253 3.378 3.002c0.413 0.367 0.45 0.999 0.083 1.412-0.197 0.223-0.472 0.336-0.747 0.336zM14.664 14.748l5.341-4.748-5.060-4.498c-0.413-0.367-1.045-0.33-1.411 0.083s-0.33 1.045 0.083 1.412l3.378 3.003-3.659 3.252c-0.413 0.367-0.45 0.999-0.083 1.412 0.197 0.223 0.472 0.336 0.747 0.336 0.236 0 0.474-0.083 0.664-0.252zM9.986 16.165l2-12c0.091-0.545-0.277-1.060-0.822-1.151-0.547-0.092-1.061 0.277-1.15 0.822l-2 12c-0.091 0.545 0.277 1.060 0.822 1.151 0.056 0.009 0.11 0.013 0.165 0.013 0.48 0 0.904-0.347 0.985-0.835z"></path>
|
|
||||||
</symbol>
|
</symbol>
|
||||||
<symbol id="icon-anchor" viewBox="0 0 16 16">
|
<symbol id="anchor" viewBox="0 0 16 16">
|
||||||
<title>anchor</title>
|
<path d="M14.779 12.779c-1.471 1.993-4.031 3.245-6.779 3.221-2.748 0.023-5.309-1.229-6.779-3.221l-1.221 1.221v-4h4l-1.1 1.099c0.882 1.46 2.357 2.509 4.1 2.807v-6.047c-1.723-0.446-3-1.997-3-3.858 0-2.209 1.791-4 4-4s4 1.791 4 4c0 1.862-1.277 3.413-3 3.858v6.047c1.742-0.297 3.218-1.347 4.099-2.807l-1.1-1.099h4v4l-1.221-1.221zM10 4c0-1.104-0.895-2-2-2s-2 0.895-2 2c0 1.104 0.895 2 2 2s2-0.896 2-2z"></path>
|
||||||
<path class="path1" d="M14.779 12.779c-1.471 1.993-4.031 3.245-6.779 3.221-2.748 0.023-5.309-1.229-6.779-3.221l-1.221 1.221v-4h4l-1.1 1.099c0.882 1.46 2.357 2.509 4.1 2.807v-6.047c-1.723-0.446-3-1.997-3-3.858 0-2.209 1.791-4 4-4s4 1.791 4 4c0 1.862-1.277 3.413-3 3.858v6.047c1.742-0.297 3.218-1.347 4.099-2.807l-1.1-1.099h4v4l-1.221-1.221zM10 4c0-1.104-0.895-2-2-2s-2 0.895-2 2c0 1.104 0.895 2 2 2s2-0.896 2-2z"></path>
|
|
||||||
</symbol>
|
</symbol>
|
||||||
<symbol id="icon-book" viewBox="0 0 24 24">
|
<symbol id="book" viewBox="0 0 24 24">
|
||||||
<title>book</title>
|
<path d="M18.984 6.984v-1.969h-9.984v1.969h9.984zM15 15v-2.016h-6v2.016h6zM18.984 11.016v-2.016h-9.984v2.016h9.984zM20.016 2.016c1.078 0 1.969 0.891 1.969 1.969v12c0 1.078-0.891 2.016-1.969 2.016h-12c-1.078 0-2.016-0.938-2.016-2.016v-12c0-1.078 0.938-1.969 2.016-1.969h12zM3.984 6v14.016h14.016v1.969h-14.016c-1.078 0-1.969-0.891-1.969-1.969v-14.016h1.969z"></path>
|
||||||
<path class="path1" d="M18.984 6.984v-1.969h-9.984v1.969h9.984zM15 15v-2.016h-6v2.016h6zM18.984 11.016v-2.016h-9.984v2.016h9.984zM20.016 2.016c1.078 0 1.969 0.891 1.969 1.969v12c0 1.078-0.891 2.016-1.969 2.016h-12c-1.078 0-2.016-0.938-2.016-2.016v-12c0-1.078 0.938-1.969 2.016-1.969h12zM3.984 6v14.016h14.016v1.969h-14.016c-1.078 0-1.969-0.891-1.969-1.969v-14.016h1.969z"></path>
|
|
||||||
</symbol>
|
</symbol>
|
||||||
<symbol id="icon-pro" viewBox="0 0 20 20">
|
<symbol id="pro" viewBox="0 0 20 20">
|
||||||
<title>pro</title>
|
<path d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-4v4h-2v-4h-4v-2h4v-4h2v4h4v2z"></path>
|
||||||
<path class="path1" d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-4v4h-2v-4h-4v-2h4v-4h2v4h4v2z"></path>
|
|
||||||
</symbol>
|
</symbol>
|
||||||
<symbol id="icon-con" viewBox="0 0 20 20">
|
<symbol id="con" viewBox="0 0 20 20">
|
||||||
<title>con</title>
|
<path d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-10v-2h10v2z"></path>
|
||||||
<path class="path1" d="M10 1.6c-4.639 0-8.4 3.761-8.4 8.4s3.761 8.4 8.4 8.4 8.4-3.761 8.4-8.4c0-4.639-3.761-8.4-8.4-8.4zM15 11h-10v-2h10v2z"></path>
|
|
||||||
</symbol>
|
</symbol>
|
||||||
<symbol id="icon-neutral" viewBox="0 0 20 20">
|
<symbol id="neutral" viewBox="0 0 20 20">
|
||||||
<title>neutral</title>
|
<path d="M9.999 0.8c-5.081 0-9.199 4.119-9.199 9.201 0 5.080 4.118 9.199 9.199 9.199s9.2-4.119 9.2-9.199c0-5.082-4.119-9.201-9.2-9.201zM10 13.001c-1.657 0-3-1.344-3-3s1.343-3 3-3c1.656 0 3 1.344 3 3s-1.344 3-3 3z"></path>
|
||||||
<path class="path1" d="M9.999 0.8c-5.081 0-9.199 4.119-9.199 9.201 0 5.080 4.118 9.199 9.199 9.199s9.2-4.119 9.2-9.199c0-5.082-4.119-9.201-9.2-9.201zM10 13.001c-1.657 0-3-1.344-3-3s1.343-3 3-3c1.656 0 3 1.344 3 3s-1.344 3-3 3z"></path>
|
</symbol>
|
||||||
|
<symbol id="chat" viewBox="0 0 24 24">
|
||||||
|
<path d="M18 8.016v-2.016h-12v2.016h12zM18 11.016v-2.016h-12v2.016h12zM18 14.016v-2.016h-12v2.016h12zM21.984 3.984v18l-3.984-3.984h-14.016c-1.078 0-1.969-0.938-1.969-2.016v-12c0-1.078 0.891-1.969 1.969-1.969h16.031c1.078 0 1.969 0.891 1.969 1.969z"></path>
|
||||||
</symbol>
|
</symbol>
|
||||||
</defs>
|
</defs>
|
||||||
</svg>
|
</svg>
|
||||||
|
|
|
@ -23,7 +23,7 @@ p

     +row
         +cell Multi-language support
-        each icon in [ "con", "pro", "pro", "pro" ]
+        each icon in [ "neutral", "pro", "pro", "pro" ]
             +cell.u-text-center #[+procon(icon)]

     +row
@ -2,8 +2,6 @@

 include ../_includes/_mixins

-p=lorem_short
-
 +aside("Help us improve the docs")
     | Did you spot a mistake or come across explanations that
     | are unclear? You can find a "Suggest edits" button at the
@ -57,7 +57,7 @@ p
         doc.ents = [Span(0, 1, label='GPE')]
         assert doc[0].ent_type_ == 'GPE'
         doc.ents = []
-        doc.ents = [(u'LondonCity', 0, 1, u'GPE')]
+        doc.ents = [(u'LondonCity', u'GPE', 0, 1)]

 p
     | The value you assign should be a sequence, the values of which
@ -30,6 +30,13 @@ p Many of the associated tools and resources that we're developing alongside spa
         +cell
             | REST microservices for spaCy demos and visualisers.

+    +row
+        +cell
+            +src(gh("spacy-notebooks")) spaCy Notebooks
+
+        +cell
+            | Jupyter notebooks for spaCy examples and tutorials.
+
 +h(2, "libraries") Libraries and projects
 +table(["Name", "Description"])
     +row
@ -141,7 +141,7 @@ p
     span.merge(label=label, tag='NNP' if label else span.root.tag_)

 matcher.add_entity('GoogleNow', on_match=merge_phrases)
-matcher.add_pattern('GoogleNow', {ORTH: 'Google'}, {ORTH: 'Now'}])
+matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
 doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
 matcher(doc)
 print([w.text for w in doc])