diff --git a/.github/contributors/oroszgy.md b/.github/contributors/oroszgy.md new file mode 100644 index 000000000..8e69b407e --- /dev/null +++ b/.github/contributors/oroszgy.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | György Orosz | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2016-12-26 | +| GitHub username | oroszgy | +| Website (optional) | gyorgy.orosz.link | diff --git a/setup.py b/setup.py index 109a49cb1..2a1d56a5e 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,7 @@ PACKAGES = [ 'spacy.es', 'spacy.fr', 'spacy.it', + 'spacy.hu', 'spacy.pt', 'spacy.nl', 'spacy.sv', diff --git a/spacy/__init__.py b/spacy/__init__.py index 3820973a0..21e0f7db4 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -8,6 +8,7 @@ from . import de from . 
import zh from . import es from . import it +from . import hu from . import fr from . import pt from . import nl @@ -26,6 +27,7 @@ set_lang_class(es.Spanish.lang, es.Spanish) set_lang_class(pt.Portuguese.lang, pt.Portuguese) set_lang_class(fr.French.lang, fr.French) set_lang_class(it.Italian.lang, it.Italian) +set_lang_class(hu.Hungarian.lang, hu.Hungarian) set_lang_class(zh.Chinese.lang, zh.Chinese) set_lang_class(nl.Dutch.lang, nl.Dutch) set_lang_class(sv.Swedish.lang, sv.Swedish) diff --git a/spacy/hu/__init__.py b/spacy/hu/__init__.py new file mode 100644 index 000000000..2343b4606 --- /dev/null +++ b/spacy/hu/__init__.py @@ -0,0 +1,23 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from .language_data import * +from ..attrs import LANG +from ..language import Language + + +class Hungarian(Language): + lang = 'hu' + + class Defaults(Language.Defaults): + tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS) + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'hu' + + prefixes = tuple(TOKENIZER_PREFIXES) + + suffixes = tuple(TOKENIZER_SUFFIXES) + + infixes = tuple(TOKENIZER_INFIXES) + + stop_words = set(STOP_WORDS) diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py new file mode 100644 index 000000000..94eeb6f4d --- /dev/null +++ b/spacy/hu/language_data.py @@ -0,0 +1,24 @@ +# encoding: utf8 +from __future__ import unicode_literals + +import six + +from spacy.language_data import strings_to_exc, update_exc +from .punctuations import * +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import ABBREVIATIONS +from .tokenizer_exceptions import OTHER_EXC +from .. 
import language_data as base + +STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES +TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES +TOKENIZER_INFIXES = TOKENIZER_INFIXES + +# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]] + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS)) + +__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/punctuations.py b/spacy/hu/punctuations.py new file mode 100644 index 000000000..3681a2fbe --- /dev/null +++ b/spacy/hu/punctuations.py @@ -0,0 +1,89 @@ +# encoding: utf8 +from __future__ import unicode_literals + +TOKENIZER_PREFIXES = r''' ++ +'''.strip().split('\n') + +TOKENIZER_SUFFIXES = r''' +, +\" +\) +\] +\} +\* +\! +\? +\$ +> +: +; +' +” +“ +« +_ +'' +’ +‘ +€ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”+-])\. +(?<=[a-züóőúéáűí)])-e +\-\- +´ +(?<=[0-9])\+ +(?<=[a-z0-9üóőúéáűí][\)\]”"'%\)§/])\. +(?<=[0-9])km² +(?<=[0-9])m² +(?<=[0-9])cm² +(?<=[0-9])mm² +(?<=[0-9])km³ +(?<=[0-9])m³ +(?<=[0-9])cm³ +(?<=[0-9])mm³ +(?<=[0-9])ha +(?<=[0-9])km +(?<=[0-9])m +(?<=[0-9])cm +(?<=[0-9])mm +(?<=[0-9])µm +(?<=[0-9])nm +(?<=[0-9])yd +(?<=[0-9])in +(?<=[0-9])ft +(?<=[0-9])kg +(?<=[0-9])g +(?<=[0-9])mg +(?<=[0-9])µg +(?<=[0-9])t +(?<=[0-9])lb +(?<=[0-9])oz +(?<=[0-9])m/s +(?<=[0-9])km/h +(?<=[0-9])mph +(?<=°[FCK])\. 
+(?<=[0-9])hPa +(?<=[0-9])Pa +(?<=[0-9])mbar +(?<=[0-9])mb +(?<=[0-9])T +(?<=[0-9])G +(?<=[0-9])M +(?<=[0-9])K +(?<=[0-9])kb +'''.strip().split('\n') + +TOKENIZER_INFIXES = r''' +… +\.\.+ +(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ]) +(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) +(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) +(?<=[0-9])[+\-\*/^](?=[0-9]) +(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) +'''.strip().split('\n') + +__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/stop_words.py b/spacy/hu/stop_words.py new file mode 100644 index 000000000..aad992e6e --- /dev/null +++ b/spacy/hu/stop_words.py @@ -0,0 +1,64 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben +amelyeket amelyet amelynek ami amikor amit amolyan amíg annak arra arról az +azok azon azonban azt aztán azután azzal azért + +be belül benne bár + +cikk cikkek cikkeket csak + +de + +e ebben eddig egy egyes egyetlen egyik egyre egyéb egész ehhez ekkor el ellen +elo eloször elott elso elég előtt emilyen ennek erre ez ezek ezen ezt ezzel +ezért + +fel felé + +ha hanem hiszen hogy hogyan hát + +ide igen ill ill. illetve ilyen ilyenkor inkább is ismét ison itt + +jobban jó jól + +kell kellett keressünk keresztül ki kívül között közül + +le legalább legyen lehet lehetett lenne lenni lesz lett + +ma maga magát majd meg mellett mely melyek mert mi miatt mikor milyen minden +mindenki mindent mindig mint mintha mit mivel miért mondta most már más másik +még míg + +nagy nagyobb nagyon ne nekem neki nem nincs néha néhány nélkül + +o oda ok oket olyan ott + +pedig persze például + +rá + +s saját sem semmi sok sokat sokkal stb. 
szemben szerint szinte számára szét + +talán te tehát teljes ti tovább továbbá több túl ugyanis + +utolsó után utána + +vagy vagyis vagyok valaki valami valamint való van vannak vele vissza viszont +volna volt voltak voltam voltunk + +által általában át + +én éppen és + +így + +ön össze + +úgy új újabb újra + +ő őket +""".split()) diff --git a/spacy/hu/tokenizer_exceptions.py b/spacy/hu/tokenizer_exceptions.py new file mode 100644 index 000000000..627035bb8 --- /dev/null +++ b/spacy/hu/tokenizer_exceptions.py @@ -0,0 +1,549 @@ +# encoding: utf8 +from __future__ import unicode_literals + +ABBREVIATIONS = """ +AkH. +Aö. +B.CS. +B.S. +B.Sc. +B.ú.é.k. +BE. +BEK. +BSC. +BSc. +BTK. +Be. +Bek. +Bfok. +Bk. +Bp. +Btk. +Btke. +Btét. +CSC. +Cal. +Co. +Colo. +Comp. +Copr. +Cs. +Csc. +Csop. +Ctv. +D. +DR. +Dipl. +Dr. +Dsz. +Dzs. +Fla. +Főszerk. +GM. +Gy. +HKsz. +Hmvh. +Inform. +K.m.f. +KER. +KFT. +KRT. +Ker. +Kft. +Kong. +Korm. +Kr. +Kr.e. +Kr.u. +Krt. +M.A. +M.S. +M.SC. +M.Sc. +MA. +MSC. +MSc. +Mass. +Mlle. +Mme. +Mo. +Mr. +Mrs. +Ms. +Mt. +N.N. +NB. +NBr. +Nat. +Nr. +Ny. +Nyh. +Nyr. +Op. +P.H. +P.S. +PH.D. +PHD. +PROF. +Ph.D +PhD. +Pp. +Proc. +Prof. +Ptk. +Rer. +S.B. +SZOLG. +Salg. +St. +Sz. +Szfv. +Szjt. +Szolg. +Szt. +Sztv. +TEL. +Tel. +Ty. +Tyr. +Ui. +Vcs. +Vhr. +X.Y. +Zs. +a. +a.C. +ac. +adj. +adm. +ag. +agit. +alez. +alk. +altbgy. +an. +ang. +arch. +at. +aug. +b. +b.a. +b.s. +b.sc. +bek. +belker. +berend. +biz. +bizt. +bo. +bp. +br. +bsc. +bt. +btk. +c. +ca. +cc. +cca. +cf. +cif. +co. +corp. +cos. +cs. +csc. +csüt. +cső. +ctv. +d. +dbj. +dd. +ddr. +de. +dec. +dikt. +dipl. +dj. +dk. +dny. +dolg. +dr. +du. +dzs. +e. +ea. +ed. +eff. +egyh. +ell. +elv. +elvt. +em. +eng. +eny. +et. +etc. +ev. +ezr. +eü. +f. +f.h. +f.é. +fam. +febr. +fej. +felv. +felügy. +ff. +ffi. +fhdgy. +fil. +fiz. +fm. +foglalk. +ford. +fp. +fr. +frsz. +fszla. +fszt. +ft. +fuv. +főig. +főisk. +főtörm. +főv. +g. +gazd. +gimn. +gk. +gkv. +gondn. +gr. +grav. +gy. +gyak. +gyártm. +gör. +h. +hads. +hallg. +hdm. 
+hdp. +hds. +hg. +hiv. +hk. +hm. +ho. +honv. +hp. +hr. +hrsz. +hsz. +ht. +htb. +hv. +hőm. +i.e. +i.sz. +id. +ifj. +ig. +igh. +ill. +imp. +inc. +ind. +inform. +inic. +int. +io. +ip. +ir. +irod. +isk. +ism. +izr. +iá. +j. +jan. +jav. +jegyz. +jjv. +jkv. +jogh. +jogt. +jr. +jvb. +júl. +jún. +k. +karb. +kat. +kb. +kcs. +kd. +ker. +kf. +kft. +kht. +kir. +kirend. +kisip. +kiv. +kk. +kkt. +klin. +kp. +krt. +kt. +ktsg. +kult. +kv. +kve. +képv. +kísérl. +kóth. +könyvt. +körz. +köv. +közj. +közl. +közp. +közt. +kü. +l. +lat. +ld. +legs. +lg. +lgv. +loc. +lt. +ltd. +ltp. +luth. +m. +m.a. +m.s. +m.sc. +ma. +mat. +mb. +med. +megh. +met. +mf. +mfszt. +min. +miss. +mjr. +mjv. +mk. +mlle. +mme. +mn. +mozg. +mr. +mrs. +ms. +msc. +má. +máj. +márc. +mé. +mélt. +mü. +műh. +műsz. +műv. +művez. +n. +nagyker. +nagys. +nat. +nb. +neg. +nk. +nov. +nu. +ny. +nyilv. +nyrt. +nyug. +o. +obj. +okl. +okt. +olv. +orsz. +ort. +ov. +ovh. +p. +pf. +pg. +ph.d +ph.d. +phd. +pk. +pl. +plb. +plc. +pld. +plur. +pol. +polg. +poz. +pp. +proc. +prof. +prot. +pság. +ptk. +pu. +pü. +q. +r. +r.k. +rac. +rad. +red. +ref. +reg. +rer. +rev. +rf. +rkp. +rkt. +rt. +rtg. +röv. +s. +s.b. +s.k. +sa. +sel. +sgt. +sm. +st. +stat. +stb. +strat. +sz. +szakm. +szaksz. +szakszerv. +szd. +szds. +szept. +szerk. +szf. +szimf. +szjt. +szkv. +szla. +szn. +szolg. +szt. +szubj. +szöv. +szül. +t. +tanm. +tb. +tbk. +tc. +techn. +tek. +tel. +tf. +tgk. +ti. +tip. +tisztv. +titks. +tk. +tkp. +tny. +tp. +tszf. +tszk. +tszkv. +tv. +tvr. +ty. +törv. +tü. +u. +ua. +ui. +unit. +uo. +uv. +v. +vas. +vb. +vegy. +vh. +vhol. +vill. +vizsg. +vk. +vkf. +vkny. +vm. +vol. +vs. +vsz. +vv. +vál. +vízv. +vö. +w. +y. +z. +zrt. +zs. +Ész. +Új-Z. +ÚjZ. +á. +ált. +ápr. +ásv. +é. +ék. +ény. +érk. +évf. +í. +ó. +ö. +össz. +ötk. +özv. +ú. +úm. +ún. +út. +ü. +üag. +üd. +üdv. +üe. +ümk. +ütk. +üv. +ő. +ű. +őrgy. +őrpk. +őrv. 
+""".strip().split() + +OTHER_EXC = """ +'' +-e +""".strip().split() diff --git a/spacy/tests/hu/__init__.py b/spacy/tests/hu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/hu/tokenizer/__init__.py b/spacy/tests/hu/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/hu/tokenizer/test_tokenizer.py b/spacy/tests/hu/tokenizer/test_tokenizer.py new file mode 100644 index 000000000..2bfbfdf36 --- /dev/null +++ b/spacy/tests/hu/tokenizer/test_tokenizer.py @@ -0,0 +1,233 @@ +# encoding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.hu import Hungarian + +_DEFAULT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), + ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), + ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), + ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), + ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), + ('A .hu.', ['A', '.hu', '.']), + ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), + ('A pl.', ['A', 'pl.']), + ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), + ('Egy..ket.', ['Egy', '..', 'ket', '.']), + ('Valami... van.', ['Valami', '...', 'van', '.']), + ('Valami ...van...', ['Valami', '...', 'van', '...']), + ('Valami...', ['Valami', '...']), + ('Valami ...', ['Valami', '...']), + ('Valami ... más.', ['Valami', '...', 'más', '.'])] + +_HYPHEN_TESTS = [ + ('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']), + ('Egy -nak.', ['Egy', '-nak', '.']), + ('Egy bel-.', ['Egy', 'bel-', '.']), + ('Dinnye-domb-.', ['Dinnye-domb-', '.']), + ('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']), + ('Lakik-e', ['Lakik', '-e']), + ('Lakik-e?', ['Lakik', '-e', '?']), + ('Lakik-e.', ['Lakik', '-e', '.']), + ('Lakik-e...', ['Lakik', '-e', '...']), + ('Lakik-e... 
van.', ['Lakik', '-e', '...', 'van', '.']), + ('Lakik-e van?', ['Lakik', '-e', 'van', '?']), + ('Lakik-elem van?', ['Lakik-elem', 'van', '?']), + ('Van lakik-elem.', ['Van', 'lakik-elem', '.']), + ('A 7-es busz?', ['A', '7-es', 'busz', '?']), + ('A 7-es?', ['A', '7-es', '?']), + ('A 7-es.', ['A', '7-es', '.']), + ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']), + ('A %-sal.', ['A', '%-sal', '.']), + ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])] + +_NUMBER_TESTS = [('A 2b van.', ['A', '2b', 'van', '.']), + ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), + ('A 2b.', ['A', '2b', '.']), + ('A 2b-ben.', ['A', '2b-ben', '.']), + ('A 3.b van.', ['A', '3.b', 'van', '.']), + ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']), + ('A 3.b.', ['A', '3.b', '.']), + ('A 3.b-ben.', ['A', '3.b-ben', '.']), + ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']), + ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']), + ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']), + ('A 1:35 van.', ['A', '1:35', 'van', '.']), + ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']), + ('A 1:35-ben.', ['A', '1:35-ben', '.']), + ('A 1.35 van.', ['A', '1.35', 'van', '.']), + ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']), + ('A 1.35-ben.', ['A', '1.35-ben', '.']), + ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']), + ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']), + ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']), + ('A 10--12 van.', ['A', '10--12', 'van', '.']), + ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']), + ('A 10--12-ben.', ['A', '10--12-ben', '.']), + ('A 10‐12 van.', ['A', '10‐12', 'van', '.']), + ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']), + ('A 10‐12-ben.', ['A', '10‐12-ben', '.']), + ('A 10‑12 van.', ['A', '10‑12', 'van', '.']), + ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']), + ('A 10‑12-ben.', ['A', '10‑12-ben', '.']), + ('A 10‒12 van.', ['A', '10‒12', 'van', '.']), + ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']), + ('A 
10‒12-ben.', ['A', '10‒12-ben', '.']), + ('A 10–12 van.', ['A', '10–12', 'van', '.']), + ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']), + ('A 10–12-ben.', ['A', '10–12-ben', '.']), + ('A 10—12 van.', ['A', '10—12', 'van', '.']), + ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']), + ('A 10—12-ben.', ['A', '10—12-ben', '.']), + ('A 10―12 van.', ['A', '10―12', 'van', '.']), + ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']), + ('A 10―12-ben.', ['A', '10―12-ben', '.']), + ('A -23,12 van.', ['A', '-23,12', 'van', '.']), + ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), + ('A -23,12-ben.', ['A', '-23,12-ben', '.']), + ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A C++ van.', ['A', 'C++', 'van', '.']), + ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']), + ('A C++.', ['A', 'C++', '.']), + ('A C++-ben.', ['A', 'C++-ben', '.']), + ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']), + ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']), + ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']), + ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']), + ('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']), + ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']), + ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']), + ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']), + ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']), + ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']), + ('A IV. 12.', ['A', 'IV.', '12.']), + ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']), + ('A 2003.01.06. 
van.', ['A', '2003.01.06.', 'van', '.']), + ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']), + ('A 2003.01.06.', ['A', '2003.01.06.']), + ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']), + ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']), + ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']), + ('A IV.12.', ['A', 'IV.12.']), + ('A IV.12-ben.', ['A', 'IV.12-ben', '.']), + ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']), + ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']), + ('A 1.1.2.', ['A', '1.1.2.']), + ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']), + ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']), + ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']), + ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']), + ('A 3,14 van.', ['A', '3,14', 'van', '.']), + ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']), + ('A 3,14-ben.', ['A', '3,14-ben', '.']), + ('A 3.14 van.', ['A', '3.14', 'van', '.']), + ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']), + ('A 3.14-ben.', ['A', '3.14-ben', '.']), + ('A 15. van.', ['A', '15.', 'van', '.']), + ('A 15-ben van.', ['A', '15-ben', 'van', '.']), + ('A 15-ben.', ['A', '15-ben', '.']), + ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']), + ('A 15.-ben.', ['A', '15.-ben', '.']), + ('A 2002--2003. 
van.', ['A', '2002--2003.', 'van', '.']), + ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']), + ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']), + ('A -0,99% van.', ['A', '-0,99%', 'van', '.']), + ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']), + ('A -0,99%.', ['A', '-0,99%', '.']), + ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']), + ('A 10--20% van.', ['A', '10--20%', 'van', '.']), + ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']), + ('A 10--20%.', ['A', '10--20%', '.']), + ('A 10--20%-ben.', ['A', '10--20%-ben', '.']), + ('A 99§ van.', ['A', '99§', 'van', '.']), + ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']), + ('A 99§-ben.', ['A', '99§-ben', '.']), + ('A 10--20§ van.', ['A', '10--20§', 'van', '.']), + ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']), + ('A 10--20§-ben.', ['A', '10--20§-ben', '.']), + ('A 99° van.', ['A', '99°', 'van', '.']), + ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']), + ('A 99°-ben.', ['A', '99°-ben', '.']), + ('A 10--20° van.', ['A', '10--20°', 'van', '.']), + ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']), + ('A 10--20°-ben.', ['A', '10--20°-ben', '.']), + ('A °C van.', ['A', '°C', 'van', '.']), + ('A °C-ben van.', ['A', '°C-ben', 'van', '.']), + ('A °C.', ['A', '°C', '.']), + ('A °C-ben.', ['A', '°C-ben', '.']), + ('A 100°C van.', ['A', '100°C', 'van', '.']), + ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']), + ('A 100°C.', ['A', '100°C', '.']), + ('A 100°C-ben.', ['A', '100°C-ben', '.']), + ('A 800x600 van.', ['A', '800x600', 'van', '.']), + ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']), + ('A 800x600-ben.', ['A', '800x600-ben', '.']), + ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']), + ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']), + ('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']), + ('A 5/J van.', ['A', '5/J', 'van', '.']), + ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']), + ('A 5/J-ben.', ['A', '5/J-ben', '.']), + ('A 5/J. 
van.', ['A', '5/J.', 'van', '.']), + ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']), + ('A 5/J.-ben.', ['A', '5/J.-ben', '.']), + ('A III/1 van.', ['A', 'III/1', 'van', '.']), + ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']), + ('A III/1-ben.', ['A', 'III/1-ben', '.']), + ('A III/1. van.', ['A', 'III/1.', 'van', '.']), + ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']), + ('A III/1.-ben.', ['A', 'III/1.-ben', '.']), + ('A III/c van.', ['A', 'III/c', 'van', '.']), + ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']), + ('A III/c.', ['A', 'III/c', '.']), + ('A III/c-ben.', ['A', 'III/c-ben', '.']), + ('A TU–154 van.', ['A', 'TU–154', 'van', '.']), + ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']), + ('A TU–154-ben.', ['A', 'TU–154-ben', '.'])] + +_QUOTE_TESTS = [('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), + ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), + ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), + ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']), + ("A don't van.", ['A', "don't", 'van', '.'])] + +_DOT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), + ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), + ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), + ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), + ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), + ('A .hu.', ['A', '.hu', '.']), + ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), + ('A pl.', ['A', 'pl.']), + ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), + ('Egy..ket.', ['Egy', '..', 'ket', '.']), + ('Valami... van.', ['Valami', '...', 'van', '.']), + ('Valami ...van...', ['Valami', '...', 'van', '...']), + ('Valami...', ['Valami', '...']), + ('Valami ...', ['Valami', '...']), + ('Valami ... 
más.', ['Valami', '...', 'más', '.'])]
+
+
+@pytest.fixture(scope="session")
+def HU():  # session scope: one Hungarian language object is built and reused by every test
+    return Hungarian()
+
+
+@pytest.fixture(scope="module")
+def hu_tokenizer(HU):  # the tokenizer under test, taken from the shared HU instance
+    return HU.tokenizer
+
+
+@pytest.mark.parametrize(("text", "expected_tokens"),
+                         _HYPHEN_TESTS + _NUMBER_TESTS + _DOT_TESTS + _QUOTE_TESTS)  # _DEFAULT_TESTS repeats _DOT_TESTS case for case, so it is not added again
+def test_testcases(hu_tokenizer, text, expected_tokens):  # each case: raw text -> expected token strings
+    tokens = hu_tokenizer(text)
+    token_list = [token.orth_ for token in tokens if not token.is_space]  # whitespace tokens are not part of the expectations
+    assert expected_tokens == token_list