mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Fixed Hungarian tokenizer for numbers
This commit is contained in:
parent
b438cfddbc
commit
1be5da1ac6
|
@ -1,6 +1,7 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from spacy.hu.tokenizer_exceptions import TOKEN_MATCH
|
||||
from .language_data import *
|
||||
from ..attrs import LANG
|
||||
from ..language import Language
|
||||
|
@ -21,3 +22,5 @@ class Hungarian(Language):
|
|||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
token_match = TOKEN_MATCH
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import six
|
||||
|
||||
from spacy.language_data import strings_to_exc, update_exc
|
||||
from .punctuation import *
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -10,19 +8,15 @@ from .tokenizer_exceptions import ABBREVIATIONS
|
|||
from .tokenizer_exceptions import OTHER_EXC
|
||||
from .. import language_data as base
|
||||
|
||||
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
|
||||
|
||||
|
||||
TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
|
||||
TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES
|
||||
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
|
||||
TOKENIZER_INFIXES = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
||||
|
|
|
@ -1,25 +1,35 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES
|
||||
from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPHA_UPPER, LIST_QUOTES, UNITS, \
|
||||
CURRENCY, LIST_PUNCT, ALPHA
|
||||
|
||||
CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿"
|
||||
|
||||
TOKENIZER_SUFFIXES = [
|
||||
TOKENIZER_SUFFIXES = (
|
||||
LIST_PUNCT +
|
||||
LIST_ELLIPSES +
|
||||
LIST_QUOTES +
|
||||
[
|
||||
r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
|
||||
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
|
||||
r'(?<=[{al}{p}{c}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES, c=CURRENCY_SYMBOLS),
|
||||
r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
|
||||
]
|
||||
]
|
||||
)
|
||||
|
||||
TOKENIZER_INFIXES = [
|
||||
r'(?<=[0-9])-(?=[0-9])',
|
||||
r'(?<=[0-9])[+\-\*/^](?=[0-9])',
|
||||
r'(?<=[{a}])--(?=[{a}])',
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
TOKENIZER_INFIXES = (
|
||||
LIST_ELLIPSES +
|
||||
[
|
||||
r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
|
||||
]
|
||||
|
||||
|
||||
TOKENIZER_INFIXES += LIST_ELLIPSES
|
||||
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[0-9{a}])({q})(?=[\-{a}])'.format(a=ALPHA, q=QUOTES),
|
||||
]
|
||||
)
|
||||
|
||||
__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from spacy.language_data.punctuation import ALPHA_LOWER, CURRENCY
|
||||
from ..language_data.tokenizer_exceptions import URL_PATTERN
|
||||
|
||||
ABBREVIATIONS = """
|
||||
AkH.
|
||||
Aö.
|
||||
|
@ -107,6 +112,7 @@ Tel.
|
|||
Ty.
|
||||
Tyr.
|
||||
Ui.
|
||||
Ut.
|
||||
Vcs.
|
||||
Vhr.
|
||||
X.Y.
|
||||
|
@ -212,6 +218,7 @@ gimn.
|
|||
gk.
|
||||
gkv.
|
||||
gondn.
|
||||
Gr.
|
||||
gr.
|
||||
grav.
|
||||
gy.
|
||||
|
@ -237,6 +244,7 @@ ht.
|
|||
htb.
|
||||
hv.
|
||||
hőm.
|
||||
ie.
|
||||
i.e.
|
||||
i.sz.
|
||||
id.
|
||||
|
@ -271,6 +279,7 @@ júl.
|
|||
jún.
|
||||
karb.
|
||||
kat.
|
||||
kath.
|
||||
kb.
|
||||
kcs.
|
||||
kd.
|
||||
|
@ -286,6 +295,7 @@ kk.
|
|||
kkt.
|
||||
klin.
|
||||
kp.
|
||||
Kr.
|
||||
krt.
|
||||
kt.
|
||||
ktsg.
|
||||
|
@ -357,6 +367,7 @@ nov.
|
|||
nu.
|
||||
ny.
|
||||
nyilv.
|
||||
Nyrt.
|
||||
nyrt.
|
||||
nyug.
|
||||
obj.
|
||||
|
@ -409,6 +420,7 @@ sa.
|
|||
sel.
|
||||
sgt.
|
||||
sm.
|
||||
St.
|
||||
st.
|
||||
stat.
|
||||
stb.
|
||||
|
@ -478,8 +490,11 @@ vs.
|
|||
vsz.
|
||||
vv.
|
||||
vál.
|
||||
várm.
|
||||
Várm.
|
||||
vízv.
|
||||
vö.
|
||||
Zrt.
|
||||
zrt.
|
||||
zs.
|
||||
Ész.
|
||||
|
@ -502,6 +517,7 @@ zs.
|
|||
ú.
|
||||
úm.
|
||||
ún.
|
||||
ú.n.
|
||||
út.
|
||||
üag.
|
||||
üd.
|
||||
|
@ -510,7 +526,6 @@ zs.
|
|||
ümk.
|
||||
ütk.
|
||||
üv.
|
||||
ő.
|
||||
ű.
|
||||
őrgy.
|
||||
őrpk.
|
||||
|
@ -520,3 +535,17 @@ zs.
|
|||
OTHER_EXC = """
|
||||
-e
|
||||
""".strip().split()
|
||||
|
||||
# Regex fragments are raw strings so that "\d", "\." etc. reach the regex
# engine verbatim instead of relying on Python passing unknown escapes through.

# Ordinals and dates: zero or more "<uppercase letters/digits><'.', '/' or '-'>"
# groups, then digits with an optional trailing dot.
ORD_NUM_OR_DATE = r"([A-Z0-9]+[./-])*(\d+\.?)"
# A number: optional sign, digits, then any number of ","- or "."-separated
# digit groups.
_NUM = r"[+\-]?\d+([,.]\d+)*"
# A single operator-like character that may sit between two numbers.
_OPS = r"[=<>+\-\*/^()÷%²]"
|
||||
# Hyphen followed by one or more characters from ALPHA_LOWER; used below as an
# optional tail of a numeric token.
_SUFFIES = "-[{a}]+".format(a=ALPHA_LOWER)
# Arithmetic expression: optional "(", a number, any number of
# operator+number pairs, then an optional closing ")" or "%".
# Raw string so the escaped parenthesis reaches the regex engine verbatim.
NUMERIC_EXP = r"\(?({n})(({o})({n}))*[)%]?".format(n=_NUM, o=_OPS)
# Digits, optional ":digits" groups, and an optional ".digits" tail.
TIME_EXP = r"\d+(:\d+)*(\.\d+)?"

# Any numeric-like token: one of the expressions above or the CURRENCY
# alternation, optionally followed by a hyphenated suffix.
NUMS = "(({ne})|({t})|({on})|({c}))({s})?".format(
    ne=NUMERIC_EXP, t=TIME_EXP, on=ORD_NUM_OR_DATE,
    c=CURRENCY, s=_SUFFIES
)
|
||||
|
||||
# Anchor the whole alternation. Without the enclosing group, "^" binds only to
# the URL branch and "$" only to the numeric branch, so a string that merely
# starts with a URL would be accepted as a single token.
TOKEN_MATCH = re.compile("^(?:({u})|({n}))$".format(u=URL_PATTERN, n=NUMS)).match
|
||||
|
|
|
@ -57,14 +57,14 @@ LIST_PUNCT = list(_PUNCT.strip().split())
|
|||
LIST_HYPHENS = list(_HYPHENS.strip().split())
|
||||
|
||||
|
||||
ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '')
|
||||
ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '')
|
||||
ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '').replace('\n', '')
|
||||
# Remove newlines as well as spaces (the second replace used to repeat ' ' and
# was a no-op), matching how ALPHA_LOWER is built.
ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace('\n', '')
|
||||
ALPHA = ALPHA_LOWER + ALPHA_UPPER
|
||||
|
||||
|
||||
QUOTES = _QUOTES.strip().replace(' ', '|')
|
||||
CURRENCY = _CURRENCY.strip().replace(' ', '|')
|
||||
UNITS = _UNITS.strip().replace(' ', '|')
|
||||
UNITS = _UNITS.strip().replace(' ', '|').replace('\n', '|')
|
||||
HYPHENS = _HYPHENS.strip().replace(' ', '|')
|
||||
|
||||
|
||||
|
|
|
@ -2,10 +2,10 @@ from __future__ import unicode_literals
|
|||
|
||||
import re
|
||||
|
||||
_URL_PATTERN = r'''
|
||||
^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$
|
||||
URL_PATTERN = r'''
|
||||
((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)
|
||||
'''.strip()
|
||||
|
||||
TOKEN_MATCH = re.compile(_URL_PATTERN).match
|
||||
TOKEN_MATCH = re.compile("^{}$".format(URL_PATTERN)).match
|
||||
|
||||
__all__ = ['TOKEN_MATCH']
|
||||
|
|
|
@ -3,7 +3,6 @@ from __future__ import unicode_literals
|
|||
|
||||
import pytest
|
||||
|
||||
|
||||
DEFAULT_TESTS = [
|
||||
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
||||
|
@ -24,11 +23,13 @@ DEFAULT_TESTS = [
|
|||
|
||||
HYPHEN_TESTS = [
|
||||
('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']),
|
||||
('Szabolcs-Szatmár-Bereg megye', ['Szabolcs-Szatmár-Bereg', 'megye']),
|
||||
('Egy -nak.', ['Egy', '-nak', '.']),
|
||||
('Egy bel-.', ['Egy', 'bel-', '.']),
|
||||
('Dinnye-domb-.', ['Dinnye-domb-', '.']),
|
||||
('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']),
|
||||
('Lakik-e', ['Lakik', '-e']),
|
||||
# '--' must be its own expected token; without the comma Python concatenates
# the adjacent literals into '--B'.
('A--B', ['A', '--', 'B']),
|
||||
('Lakik-e?', ['Lakik', '-e', '?']),
|
||||
('Lakik-e.', ['Lakik', '-e', '.']),
|
||||
('Lakik-e...', ['Lakik', '-e', '...']),
|
||||
|
@ -89,11 +90,15 @@ NUMBER_TESTS = [
|
|||
('A -23,12 van.', ['A', '-23,12', 'van', '.']),
|
||||
('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']),
|
||||
('A -23,12-ben.', ['A', '-23,12-ben', '.']),
|
||||
('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2+3 van.', ['A', '2+3', 'van', '.']),
|
||||
('A 2<3 van.', ['A', '2<3', 'van', '.']),
|
||||
('A 2=3 van.', ['A', '2=3', 'van', '.']),
|
||||
('A 2÷3 van.', ['A', '2÷3', 'van', '.']),
|
||||
('A (2÷3)-2/5=1 van.', ['A', '(2÷3)-2/5=1', 'van', '.']),
|
||||
('A 2 +3 van.', ['A', '2', '+3', 'van', '.']),
|
||||
('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2*3 van.', ['A', '2*3', 'van', '.']),
|
||||
('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
|
@ -142,6 +147,7 @@ NUMBER_TESTS = [
|
|||
('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']),
|
||||
('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']),
|
||||
('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']),
|
||||
('A +0,99% van.', ['A', '+0,99%', 'van', '.']),
|
||||
('A -0,99% van.', ['A', '-0,99%', 'van', '.']),
|
||||
('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']),
|
||||
('A -0,99%.', ['A', '-0,99%', '.']),
|
||||
|
@ -194,7 +200,16 @@ NUMBER_TESTS = [
|
|||
('A III/c-ben.', ['A', 'III/c-ben', '.']),
|
||||
('A TU–154 van.', ['A', 'TU–154', 'van', '.']),
|
||||
('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']),
|
||||
('A TU–154-ben.', ['A', 'TU–154-ben', '.'])
|
||||
('A TU–154-ben.', ['A', 'TU–154-ben', '.']),
|
||||
('A 5cm³', ['A', '5', 'cm³']),
|
||||
('A 5 $-ban', ['A', '5', '$-ban']),
|
||||
('A 5$-ban', ['A', '5$-ban']),
|
||||
('A 5$.', ['A', '5', '$', '.']),
|
||||
('A 5$', ['A', '5', '$']),
|
||||
('A $5', ['A', '$', '5']),
|
||||
('A 5km/h', ['A', '5', 'km/h']),
|
||||
('A 75%+1-100%-ig', ['A', '75%+1-100%-ig']),
|
||||
('A 5km/h.', ['A', '5', 'km/h', '.']),
|
||||
]
|
||||
|
||||
QUOTE_TESTS = [
|
||||
|
@ -202,15 +217,15 @@ QUOTE_TESTS = [
|
|||
('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
|
||||
('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']),
|
||||
('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']),
|
||||
("A don't van.", ['A', "don't", 'van', '.'])
|
||||
# ("A don't van.", ['A', "don't", 'van', '.'])
|
||||
]
|
||||
|
||||
DOT_TESTS = [
|
||||
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
||||
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
||||
('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']),
|
||||
('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']),
|
||||
('A .hu.', ['A', '.hu', '.']),
|
||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||
('A pl.', ['A', 'pl.']),
|
||||
|
@ -223,9 +238,16 @@ DOT_TESTS = [
|
|||
('Valami ... más.', ['Valami', '...', 'más', '.'])
|
||||
]
|
||||
|
||||
WIKI_TESTS = [
|
||||
('!"', ['!', '"']),
|
||||
('!"-lel', ['!', '"', '-lel']),
|
||||
('""-sorozat ', ['"', '"', '-sorozat']),
|
||||
('"(Köszönöm', ['"', '(', 'Köszönöm']),
|
||||
('(törvénykönyv)-ben ', ['(', 'törvénykönyv', ')', '-ben']),
|
||||
('"(...)"–sokkal ', ['"', '(', '...', ')', '"', '–sokkal']),
|
||||
]
|
||||
|
||||
TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS # + NUMBER_TESTS + HYPHEN_TESTS
|
||||
|
||||
TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS # + HYPHEN_TESTS # + WIKI_TESTS
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
|
||||
def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
|
||||
|
|
Loading…
Reference in New Issue
Block a user