mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Removed data files from tests..
This commit is contained in:
parent
3d5306acb9
commit
ab2f6ea46c
|
@ -1,4 +0,0 @@
|
|||
__author__ = 'gyorgyorosz'
|
||||
|
||||
if __name__ == "__main__":
|
||||
pass
|
|
@ -1,58 +0,0 @@
|
|||
# TOKEN dots
|
||||
|
||||
0. egyszeru szavak
|
||||
IN : N. kormányzósági
|
||||
IN : székhely.
|
||||
OUT: <s><w>N.</w><ws> </ws><w>kormányzósági</w><ws>
|
||||
OUT: </ws><w>székhely</w><c>.</c></s>
|
||||
|
||||
|
||||
1. szavak pontokkal
|
||||
|
||||
1.1 mondatkozi verziok
|
||||
1.1.1 ponttal kezdodo szavak
|
||||
IN : A .hu egy tld.
|
||||
OUT: <s><w>A</w><ws> </ws><w>.hu</w><ws> </ws><w>egy</w><ws> </ws><w>tld</w><c>.</c></s>
|
||||
1.1.2 pont a szo belsejeben
|
||||
IN : Az egy.ketto pelda.
|
||||
OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><ws> </ws><w>pelda</w><c>.</c></s>
|
||||
1.1.3 pont a szo vegen
|
||||
IN : A pl. rovidites.
|
||||
OUT: <s><w>A</w><ws> </ws><w>pl.</w><ws> </ws><w>rovidites</w><c>.</c></s>
|
||||
1.1.4 pontozott szo
|
||||
IN : A S.M.A.R.T. szo.
|
||||
OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w><ws> </ws><w>szo</w><c>.</c></s>
|
||||
|
||||
1.2 mondatvegi verziok
|
||||
1.2.1 ponttal kezdodo szavak
|
||||
IN : A .hu.
|
||||
OUT: <s><w>A</w><ws> </ws><w>.hu</w><c>.</c></s>
|
||||
1.2.2 pont a szo belsejeben
|
||||
IN : Az egy.ketto.
|
||||
OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><c>.</c></s>
|
||||
1.2.3 pont a szo vegen
|
||||
#TODO: cf. Huntoken
|
||||
IN : A pl.
|
||||
OUT: <s><w>A</w><ws> </ws><w>pl.</w></s>
|
||||
1.2.4 pontozott szo
|
||||
#TODO: cf. Huntoken
|
||||
IN : A S.M.A.R.T.
|
||||
OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w></s>
|
||||
|
||||
|
||||
2. tobb pont
|
||||
|
||||
2.1 ketto vagy tobb pont utan uj szo
|
||||
IN : Egy..ket.
|
||||
OUT: <s><w>Egy</w><c>..</c><w>ket</w><c>.</c></s>
|
||||
IN : Valami... van.
|
||||
OUT: <s><w>Valami</w><c>...</c><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : Valami ...van...
|
||||
OUT: <s><w>Valami</w><ws> </ws><c>...</c><w>van</w><c>...</c></s>
|
||||
IN : Valami...
|
||||
OUT: <s><w>Valami</w><c>...</c></s>
|
||||
IN : Valami ...
|
||||
OUT: <s><w>Valami</w><ws> </ws><c>...</c></s>
|
||||
IN : Valami ... más.
|
||||
OUT: <s><w>Valami</w><ws> </ws><c>...</c><ws> </ws><w>más</w><c>.</c></s>
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
# TOKEN hyphen
|
||||
|
||||
-nak, -nek es ehhez hasonlok
|
||||
IN : Egy -nak, -jaiért, -magyar, bel- van.
|
||||
OUT: <s><w>Egy</w><ws> </ws><w>-nak</w><c>,</c><ws> </ws><w>-jaiért</w><c>,</c><ws> </ws><w>-magyar</w><c>,</c><ws> </ws><w>bel-</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : Egy -nak.
|
||||
OUT: <s><w>Egy</w><ws> </ws><w>-nak</w><c>.</c></s>
|
||||
IN : Egy bel-.
|
||||
OUT: <s><w>Egy</w><ws> </ws><w>bel-</w><c>.</c></s>
|
||||
IN : Dinnye-domb-.
|
||||
OUT: <s><w>Dinnye-domb-</w><c>.</c></s>
|
||||
|
||||
kulonvalt '-e'
|
||||
IN : Ezen -e elcsatangolt.
|
||||
OUT: <s><w>Ezen</w><ws> </ws><w>-e</w><ws> </ws><w>elcsatangolt</w><c>.</c></s>
|
||||
|
||||
-e levagasa, zarojel nelkul
|
||||
IN : Lakik-e
|
||||
OUT: <s><w>Lakik</w><w>-e</w></s>
|
||||
IN : Lakik-e?
|
||||
OUT: <s><w>Lakik</w><w>-e</w><c>?</c></s>
|
||||
IN : Lakik-e.
|
||||
OUT: <s><w>Lakik</w><w>-e</w><c>.</c></s>
|
||||
IN : Lakik-e...
|
||||
OUT: <s><w>Lakik</w><w>-e</w><c>...</c></s>
|
||||
IN : Lakik-e... van.
|
||||
OUT: <s><w>Lakik</w><w>-e</w><c>...</c><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : Lakik-e van?
|
||||
OUT: <s><w>Lakik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
|
||||
# TODO: adapt spacy to handle such brackets
|
||||
zarojeles mondatkozi valtozatok
|
||||
#IN : (La)kik-e van?
|
||||
#OUT: <s><w>(La)kik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
#IN : L(a)kik-e van?
|
||||
#OUT: <s><w>L(a)kik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
#IN : Lak(ik)-e van?
|
||||
#OUT: <s><w>Lak(ik)</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
|
||||
# TODO: adapt spacy to handle such brackets
|
||||
zarojeles mondatvegi valtozatok
|
||||
#IN : (La)kik-e.
|
||||
#OUT: <s><w>(La)kik</w><w>-e</w><c>.</c></s>
|
||||
#IN : L(a)kik-e.
|
||||
#OUT: <s><w>L(a)kik</w><w>-e</w><c>.</c></s>
|
||||
#IN : Lak(ik)-e.
|
||||
#OUT: <s><w>Lak(ik)</w><w>-e</w><c>.</c></s>
|
||||
|
||||
kontroll
|
||||
IN : Lakik-elem van?
|
||||
OUT: <s><w>Lakik-elem</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
IN : Van lakik-elem.
|
||||
OUT: <s><w>Van</w><ws> </ws><w>lakik-elem</w><c>.</c></s>
|
||||
IN : A 7-es busz?
|
||||
OUT: <s><w>A</w><ws> </ws><w>7-es</w><ws> </ws><w>busz</w><c>?</c></s>
|
||||
IN : A 7-es?
|
||||
OUT: <s><w>A</w><ws> </ws><w>7-es</w><c>?</c></s>
|
||||
IN : A 7-es.
|
||||
OUT: <s><w>A</w><ws> </ws><w>7-es</w><c>.</c></s>
|
||||
|
||||
problemas eset, megengedjuk # TODO: works erroneously in HunToken, but OK in spacy
|
||||
IN : Ez (lakik)-e?
|
||||
OUT: <s><w>Ez</w><ws> </ws><c>(</c><w>lakik</w><c>)</c><w>-e</w><c>?</c></s>
|
||||
|
||||
TODO: macska-/kutyavilag
|
||||
IN : A macska-/kutyavilag van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>macska-</w><c>/</c><w>kutyavilag</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
|
||||
%-, §-
|
||||
# TODO: spaCy cannot handle such cases
|
||||
# IN : A §-sal.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>§-sal</w><c>.</c></s>
|
||||
IN : A %-sal.
|
||||
OUT: <s><w>A</w><ws> </ws><w>%-sal</w><c>.</c></s>
|
||||
|
||||
tobb kotojel
|
||||
IN : A CD-ROM-okrol.
|
||||
OUT: <s><w>A</w><ws> </ws><w>CD-ROM-okrol</w><c>.</c></s>
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,233 +0,0 @@
|
|||
# TOKEN it
|
||||
|
||||
tartomanynevek
|
||||
IN : .edu, .hu.
|
||||
OUT: <s><w>.edu</w><c>,</c><ws> </ws><w>.hu</w><c>.</c></s>
|
||||
|
||||
url-ek
|
||||
IN : red-stars.com.
|
||||
OUT: <s><w>red-stars.com</w><c>.</c></s>
|
||||
IN : www.valami.com.
|
||||
OUT: <s><w>www.valami.com</w><c>.</c></s>
|
||||
|
||||
url-ek toldalekolva
|
||||
IN : www.valami.com-ról.
|
||||
OUT: <s><w>www.valami.com-ról</w><c>.</c></s>
|
||||
IN : www.valami.comról.
|
||||
OUT: <s><w>www.valami.comról</w><c>.</c></s>
|
||||
|
||||
osszetettebb url-ek
|
||||
IN : A https://www.valami.com/index.html van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>https://www.valami.com/index.html</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A http://142.42.1.1/ van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://142.42.1.1/</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A http://142.42.1.1/.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://142.42.1.1/</w><c>.</c></s>
|
||||
IN : A http://example.com/.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://example.com/</w><c>.</c></s>
|
||||
IN : A http://example.com/ van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://example.com/</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A http://foo.com/blah_(wikipedia)#cite-1.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://foo.com/blah_(wikipedia)#cite-1</w><c>.</c></s>
|
||||
IN : A http://foo.com/blah_(wikipedia)#cite-1 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://foo.com/blah_(wikipedia)#cite-1</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A http://foo.com/blah_(wikipedia).
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://foo.com/blah_(wikipedia)</w><c>.</c></s>
|
||||
IN : A http://foo.com/blah_(wikipedia) van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://foo.com/blah_(wikipedia)</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A http://www.example.com/wpstyle/?bar=baz&inga=42&quux.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://www.example.com/wpstyle/?bar=baz&inga=42&quux</w><c>.</c></s>
|
||||
IN : A http://www.example.com/wpstyle/?bar=baz&inga=42&quux van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://www.example.com/wpstyle/?bar=baz&inga=42&quux</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A http://example.com:8080.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://example.com:8080</w><c>.</c></s>
|
||||
IN : A http://example.com:8080 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://example.com:8080</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A http://foo.bar/?q=Test%20URL-encoded%20stuff.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://foo.bar/?q=Test%20URL-encoded%20stuff</w><c>.</c></s>
|
||||
IN : A http://foo.bar/?q=Test%20URL-encoded%20stuff van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>http://foo.bar/?q=Test%20URL-encoded%20stuff</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# url with more '='
|
||||
IN : A www.kereso.elte.hu/nev=kiss,jozsef%kar=jog%tagozat=nappali.
|
||||
OUT: <s><w>A</w><ws> </ws><w>www.kereso.elte.hu/nev=kiss,jozsef%kar=jog%tagozat=nappali</w><c>.</c></s>
|
||||
IN : A www.kereso.elte.hu/nev=kiss,jozsef%kar=jog%tagozat=nappali van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>www.kereso.elte.hu/nev=kiss,jozsef%kar=jog%tagozat=nappali</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
|
||||
email
|
||||
IN : A foo.bar@baz.com van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>foo.bar@baz.com</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A foo.bar@baz.com.
|
||||
OUT: <s><w>A</w><ws> </ws><w>foo.bar@baz.com</w><c>.</c></s>
|
||||
IN : A foo.bar@baz.com-nak van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>foo.bar@baz.com-nak</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A foo.bar@baz.com-nak.
|
||||
OUT: <s><w>A</w><ws> </ws><w>foo.bar@baz.com-nak</w><c>.</c></s>
|
||||
IN : A foo.bar@baz.comnak van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>foo.bar@baz.comnak</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A foo.bar@baz.comnak.
|
||||
OUT: <s><w>A</w><ws> </ws><w>foo.bar@baz.comnak</w><c>.</c></s>
|
||||
IN : A mailto:foo.bar@baz.com van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>mailto:foo.bar@baz.com</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A mailto:foo.bar@baz.com.
|
||||
OUT: <s><w>A</w><ws> </ws><w>mailto:foo.bar@baz.com</w><c>.</c></s>
|
||||
IN : A mailto:foo.bar@baz.com-nak van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>mailto:foo.bar@baz.com-nak</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A mailto:foo.bar@baz.com-nak.
|
||||
OUT: <s><w>A</w><ws> </ws><w>mailto:foo.bar@baz.com-nak</w><c>.</c></s>
|
||||
IN : A mailto:foo.bar@baz.comnak van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>mailto:foo.bar@baz.comnak</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A mailto:foo.bar@baz.comnak.
|
||||
OUT: <s><w>A</w><ws> </ws><w>mailto:foo.bar@baz.comnak</w><c>.</c></s>
|
||||
|
||||
windows halozati szolgaltatas (vagy mi)
|
||||
IN : Az ADMIN$ van.
|
||||
OUT: <s><w>Az</w><ws> </ws><w>ADMIN$</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : Az ADMIN$.
|
||||
OUT: <s><w>Az</w><ws> </ws><w>ADMIN$</w><c>.</c></s>
|
||||
IN : Az ADMIN$-nak van.
|
||||
OUT: <s><w>Az</w><ws> </ws><w>ADMIN$-nak</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : Az ADMIN$-nak.
|
||||
OUT: <s><w>Az</w><ws> </ws><w>ADMIN$-nak</w><c>.</c></s>
|
||||
|
||||
legfelso tartomanynevek
|
||||
IN : A .edu, .hu vannak.
|
||||
OUT: <s><w>A</w><ws> </ws><w>.edu</w><c>,</c><ws> </ws><w>.hu</w><ws> </ws><w>vannak</w><c>.</c></s>
|
||||
IN : A .edu, .hu.
|
||||
OUT: <s><w>A</w><ws> </ws><w>.edu</w><c>,</c><ws> </ws><w>.hu</w><c>.</c></s>
|
||||
|
||||
slash, backslash
|
||||
IN : A / van.
|
||||
OUT: <s><w>A</w><ws> </ws><c>/</c><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A \ van.
|
||||
OUT: <s><w>A</w><ws> </ws><c>\</c><ws> </ws><w>van</w><c>.</c></s>
|
||||
|
||||
windows path
|
||||
IN : C:, \ es c:\ van.
|
||||
OUT: <s><w>C:</w><c>,</c><ws> </ws><c>\</c><ws> </ws><w>es</w><ws> </ws><w>c:\</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A \\test\test$\TEST.xls van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>\\test\test$\TEST.xls</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A \\server\share\folder\myfile.txt van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>\\server\share\folder\myfile.txt</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A \\server\share\myfile.txt van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>\\server\share\myfile.txt</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A \\123.123.123.123\share\folder\myfile.txt van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>\\123.123.123.123\share\folder\myfile.txt</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A c:\folder\myfile.txt van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>c:\folder\myfile.txt</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A c:\folder\myfile.txt-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>c:\folder\myfile.txt-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A c:\folder\myfileWithoutExtension van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>c:\folder\myfileWithoutExtension</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A c:\folder\myfile.txt.
|
||||
OUT: <s><w>A</w><ws> </ws><w>c:\folder\myfile.txt</w><c>.</c></s>
|
||||
IN : A c:\folder\myfile.txt-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>c:\folder\myfile.txt-ben</w><c>.</c></s>
|
||||
|
||||
unix path
|
||||
IN : A ./ van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>./</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A ../ van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>../</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A ./valami1/valami2/ van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>./valami1/valami2/</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A /valami/valami.txt van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>/valami/valami.txt</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A /valami/valami.txt-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>/valami/valami.txt-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A /valami/valami.txt.
|
||||
OUT: <s><w>A</w><ws> </ws><w>/valami/valami.txt</w><c>.</c></s>
|
||||
IN : A /valami/valami.txt-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>/valami/valami.txt-ben</w><c>.</c></s>
|
||||
|
||||
Huntokenes tesztek vegyesen
|
||||
1.
|
||||
IN : A C:\DINNYE\MACSKA.JPG van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>C:\DINNYE\MACSKA.JPG</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A C:\DINNYE\MACSKA.JPG-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>C:\DINNYE\MACSKA.JPG-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A C:\DINNYE\MACSKA.JPG.
|
||||
OUT: <s><w>A</w><ws> </ws><w>C:\DINNYE\MACSKA.JPG</w><c>.</c></s>
|
||||
IN : A C:\DINNYE\MACSKA.JPG-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>C:\DINNYE\MACSKA.JPG-ben</w><c>.</c></s>
|
||||
2.
|
||||
IN : A \\SZERVER\SZOLGALTATAS$ van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>\\SZERVER\SZOLGALTATAS$</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A \\SZERVER\SZOLGALTATAS$-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>\\SZERVER\SZOLGALTATAS$-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A \\SZERVER\SZOLGALTATAS$.
|
||||
OUT: <s><w>A</w><ws> </ws><w>\\SZERVER\SZOLGALTATAS$</w><c>.</c></s>
|
||||
IN : A \\SZERVER\SZOLGALTATAS$-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>\\SZERVER\SZOLGALTATAS$-ben</w><c>.</c></s>
|
||||
3.
|
||||
IN : A /etc/.././home/ van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>/etc/.././home/</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A /etc/.././home/-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>/etc/.././home/-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A /etc/.././home/.
|
||||
OUT: <s><w>A</w><ws> </ws><w>/etc/.././home/</w><c>.</c></s>
|
||||
IN : A /etc/.././home/-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>/etc/.././home/-ben</w><c>.</c></s>
|
||||
4.
|
||||
IN : A *.doc van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>*.doc</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A *.doc-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>*.doc-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A *.doc.
|
||||
OUT: <s><w>A</w><ws> </ws><w>*.doc</w><c>.</c></s>
|
||||
IN : A *.doc-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>*.doc-ben</w><c>.</c></s>
|
||||
5.
|
||||
IN : A *.* van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>*.*</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A *.*-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>*.*-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A *.*.
|
||||
OUT: <s><w>A</w><ws> </ws><w>*.*</w><c>.</c></s>
|
||||
IN : A *.*-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>*.*-ben</w><c>.</c></s>
|
||||
6.
|
||||
IN : A .bmp.zip van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>.bmp.zip</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A .bmp.zip-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>.bmp.zip-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A .bmp.zip.
|
||||
OUT: <s><w>A</w><ws> </ws><w>.bmp.zip</w><c>.</c></s>
|
||||
IN : A .bmp.zip-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>.bmp.zip-ben</w><c>.</c></s>
|
||||
|
||||
|
||||
fajlnevek
|
||||
IN : A teszt.txt van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>teszt.txt</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A teszt.txt-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>teszt.txt-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A teszt.txt.
|
||||
OUT: <s><w>A</w><ws> </ws><w>teszt.txt</w><c>.</c></s>
|
||||
IN : A teszt.txt-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>teszt.txt-ben</w><c>.</c></s>
|
||||
IN : A .txt van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>.txt</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
|
||||
|
||||
MINTA
|
||||
IN : A van.
|
||||
OUT: <s><w>A</w><ws> </ws><w></w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A -ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A .
|
||||
OUT: <s><w>A</w><ws> </ws><w></w><c>.</c></s>
|
||||
IN : A -ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-ben</w><c>.</c></s>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,22 +0,0 @@
|
|||
# TOKEN misc
|
||||
|
||||
TODO: html entities
|
||||
IN : Molière-rol van.
|
||||
OUT: <s><w>Molière-rol</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
|
||||
|
||||
&-t tartalmazo nagybetus szavak
|
||||
IN : AT&T van.
|
||||
OUT: <s><w>AT&T</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
|
||||
|
||||
zarojeles mondatkozi valtozatok
|
||||
IN : (La)kik-e van?
|
||||
OUT: <s><w>(La)kik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
IN : L(a)kik-e van?
|
||||
OUT: <s><w>L(a)kik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
IN : Lak(ik)-e van?
|
||||
OUT: <s><w>Lak(ik)</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
|
||||
|
||||
|
|
@ -1,613 +0,0 @@
|
|||
# TOKEN numbers
|
||||
|
||||
szam es betu
|
||||
IN : A 2b van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2b</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2b-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2b-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2b.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2b</w><c>.</c></s>
|
||||
IN : A 2b-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2b-ben</w><c>.</c></s>
|
||||
IN : A 3.b van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>3.b</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 3.b-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>3.b-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 3.b.
|
||||
OUT: <s><w>A</w><ws> </ws><w>3.b</w><c>.</c></s>
|
||||
IN : A 3.b-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>3.b-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 1:20:36.7 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1:20:36.7</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 1:20:36.7-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1:20:36.7-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# TODO: not supported by spacy
|
||||
# IN : A 1:20:36.7.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>1:20:36.7</w><c>.</c></s>
|
||||
IN : A 1:20:36.7-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1:20:36.7-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 1:35 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1:35</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 1:35-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1:35-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# TODO: not supported by spacy
|
||||
# IN : A 1:35.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>1:35</w><c>.</c></s>
|
||||
IN : A 1:35-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1:35-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 1.35 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1.35</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 1.35-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1.35-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# TODO: not supported by spacy
|
||||
# IN : A 1.35.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>1.35</w><c>.</c></s>
|
||||
IN : A 1.35-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1.35-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 4:01,95 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>4:01,95</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 4:01,95-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>4:01,95-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 4:01,95.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>4:01,95</w><c>.</c></s>
|
||||
IN : A 4:01,95-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>4:01,95-ben</w><c>.</c></s>
|
||||
|
||||
DASH
|
||||
hyphen minus
|
||||
IN : A 10--12 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--12</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10--12-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--12-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 10--12.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>10--12</w><c>.</c></s>
|
||||
IN : A 10--12-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--12-ben</w><c>.</c></s>
|
||||
hyphen
|
||||
IN : A 10‐12 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10‐12</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10‐12-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10‐12-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 10‐12.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>10‐12</w><c>.</c></s>
|
||||
IN : A 10‐12-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10‐12-ben</w><c>.</c></s>
|
||||
non-breaking hyphen
|
||||
IN : A 10‑12 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10‑12</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10‑12-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10‑12-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 10‑12.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>10‑12</w><c>.</c></s>
|
||||
IN : A 10‑12-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10‑12-ben</w><c>.</c></s>
|
||||
figure dash
|
||||
IN : A 10‒12 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10‒12</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10‒12-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10‒12-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 10‒12.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>10‒12</w><c>.</c></s>
|
||||
IN : A 10‒12-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10‒12-ben</w><c>.</c></s>
|
||||
en dash
|
||||
IN : A 10–12 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10–12</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10–12-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10–12-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 10–12.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>10–12</w><c>.</c></s>
|
||||
IN : A 10–12-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10–12-ben</w><c>.</c></s>
|
||||
em dash
|
||||
IN : A 10—12 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10—12</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10—12-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10—12-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 10—12.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>10—12</w><c>.</c></s>
|
||||
IN : A 10—12-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10—12-ben</w><c>.</c></s>
|
||||
horizontal bar
|
||||
IN : A 10―12 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10―12</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10―12-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10―12-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 10―12.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>10―12</w><c>.</c></s>
|
||||
IN : A 10―12-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10―12-ben</w><c>.</c></s>
|
||||
|
||||
tagolt szamok
|
||||
IN : A -23,12 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-23,12</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A -23,12-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-23,12-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A -23,12.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>-23,12</w><c>.</c></s>
|
||||
IN : A -23,12-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-23,12-ben</w><c>.</c></s>
|
||||
|
||||
#TODO: not supported transformations in spaCy
|
||||
# IN : A +12 500 van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12 500</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A +12 500-ben van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12 500-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A +12 500.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12 500</w><c>.</c></s>
|
||||
# IN : A +12 500-ben.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12 500-ben</w><c>.</c></s>
|
||||
|
||||
# IN : A +12 500,99 van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12 500,99</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A +12 500,99-ben van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12 500,99-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A +12 500,99.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12 500,99</w><c>.</c></s>
|
||||
# IN : A +12 500,99-ben.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12 500,99-ben</w><c>.</c></s>
|
||||
|
||||
# IN : A +12.500.000 van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12.500.000</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A +12.500.000-ben van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12.500.000-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A +12.500.000.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12.500.000</w><c>.</c></s>
|
||||
# IN : A +12.500.000-ben.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>+12.500.000-ben</w><c>.</c></s>
|
||||
|
||||
A muveletek (+, -, *, /) jelentette kihivasok:
|
||||
IN : A 2+3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><c>+</c><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2+3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><c>+</c><w>3</w><c>.</c></s>
|
||||
IN : A 2 +3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>+</c><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2 +3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>+</c><w>3</w><c>.</c></s>
|
||||
IN : A 2+ 3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><c>+</c><ws> </ws><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2+ 3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><c>+</c><ws> </ws><w>3</w><c>.</c></s>
|
||||
IN : A 2 + 3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>+</c><ws> </ws><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2 + 3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>+</c><ws> </ws><w>3</w><c>.</c></s>
|
||||
|
||||
IN : A 2-3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><c>-</c><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2-3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><c>-</c><w>3</w><c>.</c></s>
|
||||
IN : A 2 -3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>-</c><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2 -3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>-</c><w>3</w><c>.</c></s>
|
||||
IN : A 2- 3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><c>-</c><ws> </ws><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2- 3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><c>-</c><ws> </ws><w>3</w><c>.</c></s>
|
||||
IN : A 2 - 3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>-</c><ws> </ws><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2 - 3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>-</c><ws> </ws><w>3</w><c>.</c></s>
|
||||
|
||||
IN : A 2*3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><c>*</c><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2*3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><c>*</c><w>3</w><c>.</c></s>
|
||||
IN : A 2 *3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>*</c><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2 *3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>*</c><w>3</w><c>.</c></s>
|
||||
IN : A 2* 3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><c>*</c><ws> </ws><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2* 3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><c>*</c><ws> </ws><w>3</w><c>.</c></s>
|
||||
IN : A 2 * 3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>*</c><ws> </ws><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2 * 3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>*</c><ws> </ws><w>3</w><c>.</c></s>
|
||||
|
||||
IN : A 2/3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><c>/</c><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2/3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><c>/</c><w>3</w><c>.</c></s>
|
||||
IN : A 2 /3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>/</c><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2 /3.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>/</c><w>3</w><c>.</c></s>
|
||||
IN : A 2/ 3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><c>/</c><ws> </ws><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2/ 3.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><c>/</c><ws> </ws><w>3</w><c>.</c></s>
|
||||
IN : A 2 / 3 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>/</c><ws> </ws><w>3</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2 / 3.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2</w><ws> </ws><c>/</c><ws> </ws><w>3</w><c>.</c></s>
|
||||
|
||||
IN : A C++ van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>C++</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A C++-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>C++-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A C++.
|
||||
OUT: <s><w>A</w><ws> </ws><w>C++</w><c>.</c></s>
|
||||
IN : A C++-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>C++-ben</w><c>.</c></s>
|
||||
|
||||
|
||||
|
||||
datumok
|
||||
IN : A 2003. I. 06. van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.</w><ws> </ws><w>I.</w><ws> </ws><w>06.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2003. I. 06-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.</w><ws> </ws><w>I.</w><ws> </ws><w>06-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2003. I. 06.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.</w><ws> </ws><w>I.</w><ws> </ws><w>06.</w></s>
|
||||
IN : A 2003. I. 06-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.</w><ws> </ws><w>I.</w><ws> </ws><w>06-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 2003. 01. 06. van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.</w><ws> </ws><w>01.</w><ws> </ws><w>06.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2003. 01. 06-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.</w><ws> </ws><w>01.</w><ws> </ws><w>06-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2003. 01. 06.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.</w><ws> </ws><w>01.</w><ws> </ws><w>06.</w></s>
|
||||
IN : A 2003. 01. 06-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.</w><ws> </ws><w>01.</w><ws> </ws><w>06-ben</w><c>.</c></s>
|
||||
|
||||
IN : A IV. 12. van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>IV.</w><ws> </ws><w>12.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A IV. 12-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>IV.</w><ws> </ws><w>12-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A IV. 12.
|
||||
OUT: <s><w>A</w><ws> </ws><w>IV.</w><ws> </ws><w>12.</w></s>
|
||||
IN : A IV. 12-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>IV.</w><ws> </ws><w>12-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 2003.01.06. van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.01.06.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2003.01.06-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.01.06-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2003.01.06.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.01.06.</w></s>
|
||||
IN : A 2003.01.06-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2003.01.06-ben</w><c>.</c></s>
|
||||
|
||||
# TODO: cannot implement in spacy
|
||||
# IN : A 2003/01/06 van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2003/01/06</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2003/01/06-ben van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2003/01/06-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2003/01/06.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2003/01/06</w><c>.</c></s>
|
||||
# IN : A 2003/01/06-ben.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2003/01/06-ben</w><c>.</c></s>
|
||||
|
||||
IN : A IV.12. van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>IV.12.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A IV.12-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>IV.12-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A IV.12.
|
||||
OUT: <s><w>A</w><ws> </ws><w>IV.12.</w></s>
|
||||
IN : A IV.12-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>IV.12-ben</w><c>.</c></s>
|
||||
|
||||
fejezetazonosito
|
||||
IN : A 1.1.2. van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1.1.2.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 1.1.2-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1.1.2-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 1.1.2.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1.1.2.</w></s>
|
||||
IN : A 1.1.2-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1.1.2-ben</w><c>.</c></s>
|
||||
|
||||
# TODO cannot handle "/" in multiple ways in spacy
|
||||
egyeb pontot tartalmazo szamok
|
||||
# IN : A 123.45/67. van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>123.45/67.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 123.45/67-ben van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>123.45/67-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 123.45/67.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>123.45/67</w><c>.</c></s>
|
||||
# IN : A 123.45/67-ben.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>123.45/67-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 1,5--2,5 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1,5--2,5</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 1,5--2,5-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1,5--2,5-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 1,5--2,5.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>1,5--2,5</w><c>.</c></s>
|
||||
IN : A 1,5--2,5-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1,5--2,5-ben</w><c>.</c></s>
|
||||
|
||||
tizedestortek vesszovel
|
||||
IN : A 3,14 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>3,14</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 3,14-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>3,14-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 3,14.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>3,14</w><c>.</c></s>
|
||||
IN : A 3,14-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>3,14-ben</w><c>.</c></s>
|
||||
|
||||
tizedestortek ponttal
|
||||
IN : A 3.14 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>3.14</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 3.14-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>3.14-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 3.14.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>3.14</w><c>.</c></s>
|
||||
IN : A 3.14-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>3.14-ben</w><c>.</c></s>
|
||||
|
||||
sorszamnevek
|
||||
IN : A 15. van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>15.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 15-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>15-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 15.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>15</w><c>.</c></s>
|
||||
IN : A 15-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>15-ben</w><c>.</c></s>
|
||||
IN : A 15.-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>15.-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 15.-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>15.-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 2002--2003. van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2002--2003.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 2002--2003-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2002--2003-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 2002--2003.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>2002--2003</w><c>.</c></s>
|
||||
IN : A 2002--2003-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>2002--2003-ben</w><c>.</c></s>
|
||||
|
||||
%-ot tartalmazo szamok
|
||||
IN : A -0,99% van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-0,99%</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A -0,99%-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-0,99%-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A -0,99%.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-0,99%</w><c>.</c></s>
|
||||
IN : A -0,99%-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-0,99%-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 10--20% van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--20%</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10--20%-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--20%-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10--20%.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--20%</w><c>.</c></s>
|
||||
IN : A 10--20%-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--20%-ben</w><c>.</c></s>
|
||||
|
||||
§-t tartalmazo szamok
|
||||
IN : A 99§ van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>99§</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 99§-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>99§-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 99§.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>99§</w><c>.</c></s>
|
||||
IN : A 99§-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>99§-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 10--20§ van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--20§</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10--20§-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--20§-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 10--20§.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>10--20§</w><c>.</c></s>
|
||||
IN : A 10--20§-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--20§-ben</w><c>.</c></s>
|
||||
|
||||
°-t tartalmazo szamok
|
||||
IN : A 99° van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>99°</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 99°-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>99°-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 99°.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>99°</w><c>.</c></s>
|
||||
IN : A 99°-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>99°-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 10--20° van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--20°</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 10--20°-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--20°-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A 10--20°.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>10--20°</w><c>.</c></s>
|
||||
IN : A 10--20°-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>10--20°-ben</w><c>.</c></s>
|
||||
|
||||
(ez nem szam, de ide jon)
|
||||
IN : A °C van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>°C</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A °C-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>°C-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A °C.
|
||||
OUT: <s><w>A</w><ws> </ws><w>°C</w><c>.</c></s>
|
||||
IN : A °C-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>°C-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 100°C van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>100°C</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 100°C-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>100°C-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 100°C.
|
||||
OUT: <s><w>A</w><ws> </ws><w>100°C</w><c>.</c></s>
|
||||
IN : A 100°C-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>100°C-ben</w><c>.</c></s>
|
||||
|
||||
euroz, dollaz, yeniz
|
||||
# IN : A $1,000 van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>$1,000</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A $1,000-ben van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>$1,000-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A $1,000.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>$1,000</w><c>.</c></s>
|
||||
# IN : A $1,000-ben.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>$1,000-ben</w><c>.</c></s>
|
||||
|
||||
# IN : A €1,000 van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>€1,000</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A €1,000-ben van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>€1,000-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A €1,000.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>€1,000</w><c>.</c></s>
|
||||
# IN : A €1,000-ben.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>€1,000-ben</w><c>.</c></s>
|
||||
|
||||
# IN : A ¥1,000 van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>¥1,000</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A ¥1,000-ben van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>¥1,000-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A ¥1,000.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>¥1,000</w><c>.</c></s>
|
||||
# IN : A ¥1,000-ben.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>¥1,000-ben</w><c>.</c></s>
|
||||
|
||||
# IN : A £1,000 van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>£1,000</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A £1,000-ben van.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>£1,000-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
# IN : A £1,000.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>£1,000</w><c>.</c></s>
|
||||
# IN : A £1,000-ben.
|
||||
# OUT: <s><w>A</w><ws> </ws><w>£1,000-ben</w><c>.</c></s>
|
||||
|
||||
#aranyok
|
||||
#IN : A 1645kJ/1000g van.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>1645kJ/1000g</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 1645kJ/1000g-ben van.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>1645kJ/1000g-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 1645kJ/1000g.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>1645kJ/1000g</w><c>.</c></s>
|
||||
#IN : A 1645kJ/1000g-ben.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>1645kJ/1000g-ben</w><c>.</c></s>
|
||||
|
||||
#IN : A 1645 kJ/1000g van.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>1645</w><ws> </ws><w>kJ/1000g</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 1645 kJ/1000g-ben van.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>1645</w><ws> </ws><w>kJ/1000g-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 1645 kJ/1000g.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>1645</w><ws> </ws><w>kJ/1000g</w><c>.</c></s>
|
||||
#IN : A 1645 kJ/1000g-ben.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>1645</w><ws> </ws><w>kJ/1000g-ben</w><c>.</c></s>
|
||||
|
||||
meretek
|
||||
IN : A 800x600 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>800x600</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 800x600-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>800x600-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 800x600.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>800x600</w><c>.</c></s>
|
||||
IN : A 800x600-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>800x600-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 1x2x3x4 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1x2x3x4</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 1x2x3x4-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1x2x3x4-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 1x2x3x4.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>1x2x3x4</w><c>.</c></s>
|
||||
IN : A 1x2x3x4-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>1x2x3x4-ben</w><c>.</c></s>
|
||||
|
||||
tortszamok
|
||||
#IN : A 5/4 van.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>5/4</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 5/4-ben van.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>5/4-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 5/4.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>5/4</w><c>.</c></s>
|
||||
#IN : A 5/4-ben.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>5/4-ben</w><c>.</c></s>
|
||||
|
||||
#IN : A 5/4. van.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>5/4.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 5/4.-ben van.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>5/4.-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 5/4.-ben.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>5/4.-ben</w><c>.</c></s>
|
||||
|
||||
egyeb szamok perjellel
|
||||
IN : A 5/J van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>5/J</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 5/J-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>5/J-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A 5/J.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>5/J</w><c>.</c></s>
|
||||
IN : A 5/J-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>5/J-ben</w><c>.</c></s>
|
||||
|
||||
IN : A 5/J. van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>5/J.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 5/J.-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>5/J.-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A 5/J.-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>5/J.-ben</w><c>.</c></s>
|
||||
|
||||
romai szamok perjellel
|
||||
IN : A III/1 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>III/1</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A III/1-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>III/1-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A III/1.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>III/1</w><c>.</c></s>
|
||||
IN : A III/1-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>III/1-ben</w><c>.</c></s>
|
||||
|
||||
IN : A III/1. van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>III/1.</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A III/1.-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>III/1.-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A III/1.-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>III/1.-ben</w><c>.</c></s>
|
||||
|
||||
IN : A III/c van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>III/c</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A III/c-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>III/c-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A III/c.
|
||||
OUT: <s><w>A</w><ws> </ws><w>III/c</w><c>.</c></s>
|
||||
IN : A III/c-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>III/c-ben</w><c>.</c></s>
|
||||
|
||||
egyeb azonositok
|
||||
IN : A TU–154 van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>TU–154</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A TU–154-ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>TU–154-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A TU–154.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>TU–154</w><c>.</c></s>
|
||||
IN : A TU–154-ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>TU–154-ben</w><c>.</c></s>
|
||||
|
||||
szamok es nagybetuk legalabb ket perjellel
|
||||
#IN : A BDE/2000/01/0983/0010 van.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>BDE/2000/01/0983/0010</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A BDE/2000/01/0983/0010-ben van.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>BDE/2000/01/0983/0010-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
#IN : A BDE/2000/01/0983/0010.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>BDE/2000/01/0983/0010</w><c>.</c></s>
|
||||
#IN : A BDE/2000/01/0983/0010-ben.
|
||||
#OUT: <s><w>A</w><ws> </ws><w>BDE/2000/01/0983/0010-ben</w><c>.</c></s>
|
||||
|
||||
|
||||
MINTA
|
||||
IN : A van.
|
||||
OUT: <s><w>A</w><ws> </ws><w></w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A -ben van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-ben</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : A .
|
||||
OUT: <s><w>A</w><ws> </ws><w></w><c>.</c></s>
|
||||
IN : A -ben.
|
||||
OUT: <s><w>A</w><ws> </ws><w>-ben</w><c>.</c></s>
|
||||
|
|
@ -1,22 +0,0 @@
|
|||
# TOKEN quote
|
||||
|
||||
mondatban
|
||||
IN : Az "Ime, hat"-ban irja.
|
||||
OUT: <s><w>Az</w><ws> </ws><c>"</c><w>Ime</w><c>,</c><ws> </ws><w>hat</w><c>"</c><w>-ban</w><ws> </ws><w>irja</w><c>.</c></s>
|
||||
|
||||
mondat elejen
|
||||
IN : "Ime, hat"-ban irja.
|
||||
OUT: <s><c>"</c><w>Ime</w><c>,</c><ws> </ws><w>hat</w><c>"</c><w>-ban</w><ws> </ws><w>irja</w><c>.</c></s>
|
||||
|
||||
mondat vegen
|
||||
IN : Az "Ime, hat".
|
||||
OUT: <s><w>Az</w><ws> </ws><c>"</c><w>Ime</w><c>,</c><ws> </ws><w>hat</w><c>"</c><c>.</c></s>
|
||||
|
||||
magaban
|
||||
IN : Egy 24"-os monitor.
|
||||
OUT: <s><w>Egy</w><ws> </ws><w>24</w><c>"</c><w>-os</w><ws> </ws><w>monitor</w><c>.</c></s>
|
||||
|
||||
aposztrof
|
||||
IN : A don't van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>don't</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
|
|
@ -1,68 +1,216 @@
|
|||
import os
|
||||
import re
|
||||
|
||||
import pytest
|
||||
|
||||
from spacy.hu import Hungarian
|
||||
|
||||
_MODULE_PATH = os.path.dirname(__file__)
|
||||
_DEFAULT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
||||
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
||||
('A .hu.', ['A', '.hu', '.']),
|
||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||
('A pl.', ['A', 'pl.']),
|
||||
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
||||
('Egy..ket.', ['Egy', '..', 'ket', '.']),
|
||||
('Valami... van.', ['Valami', '...', 'van', '.']),
|
||||
('Valami ...van...', ['Valami', '...', 'van', '...']),
|
||||
('Valami...', ['Valami', '...']),
|
||||
('Valami ...', ['Valami', '...']),
|
||||
('Valami ... más.', ['Valami', '...', 'más', '.'])]
|
||||
|
||||
_HYPHEN_TESTS = [
|
||||
('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']),
|
||||
('Egy -nak.', ['Egy', '-nak', '.']),
|
||||
('Egy bel-.', ['Egy', 'bel-', '.']),
|
||||
('Dinnye-domb-.', ['Dinnye-domb-', '.']),
|
||||
('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']),
|
||||
('Lakik-e', ['Lakik', '-e']),
|
||||
('Lakik-e?', ['Lakik', '-e', '?']),
|
||||
('Lakik-e.', ['Lakik', '-e', '.']),
|
||||
('Lakik-e...', ['Lakik', '-e', '...']),
|
||||
('Lakik-e... van.', ['Lakik', '-e', '...', 'van', '.']),
|
||||
('Lakik-e van?', ['Lakik', '-e', 'van', '?']),
|
||||
('Lakik-elem van?', ['Lakik-elem', 'van', '?']),
|
||||
('Van lakik-elem.', ['Van', 'lakik-elem', '.']),
|
||||
('A 7-es busz?', ['A', '7-es', 'busz', '?']),
|
||||
('A 7-es?', ['A', '7-es', '?']),
|
||||
('A 7-es.', ['A', '7-es', '.']),
|
||||
('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
|
||||
('A %-sal.', ['A', '%-sal', '.']),
|
||||
('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])]
|
||||
|
||||
class TokenizerTestCase(object):
|
||||
INPUT_PREFIX = "IN :"
|
||||
OUTPUT_PREFIX = "OUT:"
|
||||
WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")
|
||||
_NUMBER_TESTS = [('A 2b van.', ['A', '2b', 'van', '.']),
|
||||
('A 2b-ben van.', ['A', '2b-ben', 'van', '.']),
|
||||
('A 2b.', ['A', '2b', '.']),
|
||||
('A 2b-ben.', ['A', '2b-ben', '.']),
|
||||
('A 3.b van.', ['A', '3.b', 'van', '.']),
|
||||
('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']),
|
||||
('A 3.b.', ['A', '3.b', '.']),
|
||||
('A 3.b-ben.', ['A', '3.b-ben', '.']),
|
||||
('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']),
|
||||
('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']),
|
||||
('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']),
|
||||
('A 1:35 van.', ['A', '1:35', 'van', '.']),
|
||||
('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']),
|
||||
('A 1:35-ben.', ['A', '1:35-ben', '.']),
|
||||
('A 1.35 van.', ['A', '1.35', 'van', '.']),
|
||||
('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']),
|
||||
('A 1.35-ben.', ['A', '1.35-ben', '.']),
|
||||
('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']),
|
||||
('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']),
|
||||
('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']),
|
||||
('A 10--12 van.', ['A', '10--12', 'van', '.']),
|
||||
('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']),
|
||||
('A 10--12-ben.', ['A', '10--12-ben', '.']),
|
||||
('A 10‐12 van.', ['A', '10‐12', 'van', '.']),
|
||||
('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']),
|
||||
('A 10‐12-ben.', ['A', '10‐12-ben', '.']),
|
||||
('A 10‑12 van.', ['A', '10‑12', 'van', '.']),
|
||||
('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']),
|
||||
('A 10‑12-ben.', ['A', '10‑12-ben', '.']),
|
||||
('A 10‒12 van.', ['A', '10‒12', 'van', '.']),
|
||||
('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']),
|
||||
('A 10‒12-ben.', ['A', '10‒12-ben', '.']),
|
||||
('A 10–12 van.', ['A', '10–12', 'van', '.']),
|
||||
('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']),
|
||||
('A 10–12-ben.', ['A', '10–12-ben', '.']),
|
||||
('A 10—12 van.', ['A', '10—12', 'van', '.']),
|
||||
('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']),
|
||||
('A 10—12-ben.', ['A', '10—12-ben', '.']),
|
||||
('A 10―12 van.', ['A', '10―12', 'van', '.']),
|
||||
('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']),
|
||||
('A 10―12-ben.', ['A', '10―12-ben', '.']),
|
||||
('A -23,12 van.', ['A', '-23,12', 'van', '.']),
|
||||
('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']),
|
||||
('A -23,12-ben.', ['A', '-23,12-ben', '.']),
|
||||
('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']),
|
||||
('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']),
|
||||
('A C++ van.', ['A', 'C++', 'van', '.']),
|
||||
('A C++-ben van.', ['A', 'C++-ben', 'van', '.']),
|
||||
('A C++.', ['A', 'C++', '.']),
|
||||
('A C++-ben.', ['A', 'C++-ben', '.']),
|
||||
('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']),
|
||||
('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']),
|
||||
('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']),
|
||||
('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']),
|
||||
('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']),
|
||||
('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']),
|
||||
('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']),
|
||||
('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']),
|
||||
('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']),
|
||||
('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']),
|
||||
('A IV. 12.', ['A', 'IV.', '12.']),
|
||||
('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']),
|
||||
('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']),
|
||||
('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']),
|
||||
('A 2003.01.06.', ['A', '2003.01.06.']),
|
||||
('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']),
|
||||
('A IV.12. van.', ['A', 'IV.12.', 'van', '.']),
|
||||
('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']),
|
||||
('A IV.12.', ['A', 'IV.12.']),
|
||||
('A IV.12-ben.', ['A', 'IV.12-ben', '.']),
|
||||
('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']),
|
||||
('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']),
|
||||
('A 1.1.2.', ['A', '1.1.2.']),
|
||||
('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']),
|
||||
('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']),
|
||||
('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']),
|
||||
('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']),
|
||||
('A 3,14 van.', ['A', '3,14', 'van', '.']),
|
||||
('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']),
|
||||
('A 3,14-ben.', ['A', '3,14-ben', '.']),
|
||||
('A 3.14 van.', ['A', '3.14', 'van', '.']),
|
||||
('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']),
|
||||
('A 3.14-ben.', ['A', '3.14-ben', '.']),
|
||||
('A 15. van.', ['A', '15.', 'van', '.']),
|
||||
('A 15-ben van.', ['A', '15-ben', 'van', '.']),
|
||||
('A 15-ben.', ['A', '15-ben', '.']),
|
||||
('A 15.-ben van.', ['A', '15.-ben', 'van', '.']),
|
||||
('A 15.-ben.', ['A', '15.-ben', '.']),
|
||||
('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']),
|
||||
('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']),
|
||||
('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']),
|
||||
('A -0,99% van.', ['A', '-0,99%', 'van', '.']),
|
||||
('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']),
|
||||
('A -0,99%.', ['A', '-0,99%', '.']),
|
||||
('A -0,99%-ben.', ['A', '-0,99%-ben', '.']),
|
||||
('A 10--20% van.', ['A', '10--20%', 'van', '.']),
|
||||
('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']),
|
||||
('A 10--20%.', ['A', '10--20%', '.']),
|
||||
('A 10--20%-ben.', ['A', '10--20%-ben', '.']),
|
||||
('A 99§ van.', ['A', '99§', 'van', '.']),
|
||||
('A 99§-ben van.', ['A', '99§-ben', 'van', '.']),
|
||||
('A 99§-ben.', ['A', '99§-ben', '.']),
|
||||
('A 10--20§ van.', ['A', '10--20§', 'van', '.']),
|
||||
('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']),
|
||||
('A 10--20§-ben.', ['A', '10--20§-ben', '.']),
|
||||
('A 99° van.', ['A', '99°', 'van', '.']),
|
||||
('A 99°-ben van.', ['A', '99°-ben', 'van', '.']),
|
||||
('A 99°-ben.', ['A', '99°-ben', '.']),
|
||||
('A 10--20° van.', ['A', '10--20°', 'van', '.']),
|
||||
('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']),
|
||||
('A 10--20°-ben.', ['A', '10--20°-ben', '.']),
|
||||
('A °C van.', ['A', '°C', 'van', '.']),
|
||||
('A °C-ben van.', ['A', '°C-ben', 'van', '.']),
|
||||
('A °C.', ['A', '°C', '.']),
|
||||
('A °C-ben.', ['A', '°C-ben', '.']),
|
||||
('A 100°C van.', ['A', '100°C', 'van', '.']),
|
||||
('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']),
|
||||
('A 100°C.', ['A', '100°C', '.']),
|
||||
('A 100°C-ben.', ['A', '100°C-ben', '.']),
|
||||
('A 800x600 van.', ['A', '800x600', 'van', '.']),
|
||||
('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']),
|
||||
('A 800x600-ben.', ['A', '800x600-ben', '.']),
|
||||
('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']),
|
||||
('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']),
|
||||
('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']),
|
||||
('A 5/J van.', ['A', '5/J', 'van', '.']),
|
||||
('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']),
|
||||
('A 5/J-ben.', ['A', '5/J-ben', '.']),
|
||||
('A 5/J. van.', ['A', '5/J.', 'van', '.']),
|
||||
('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']),
|
||||
('A 5/J.-ben.', ['A', '5/J.-ben', '.']),
|
||||
('A III/1 van.', ['A', 'III/1', 'van', '.']),
|
||||
('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']),
|
||||
('A III/1-ben.', ['A', 'III/1-ben', '.']),
|
||||
('A III/1. van.', ['A', 'III/1.', 'van', '.']),
|
||||
('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']),
|
||||
('A III/1.-ben.', ['A', 'III/1.-ben', '.']),
|
||||
('A III/c van.', ['A', 'III/c', 'van', '.']),
|
||||
('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']),
|
||||
('A III/c.', ['A', 'III/c', '.']),
|
||||
('A III/c-ben.', ['A', 'III/c-ben', '.']),
|
||||
('A TU–154 van.', ['A', 'TU–154', 'van', '.']),
|
||||
('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']),
|
||||
('A TU–154-ben.', ['A', 'TU–154-ben', '.'])]
|
||||
|
||||
def __init__(self, input_str, expected_words):
|
||||
self.input = input_str
|
||||
self.expected_tokens = expected_words
|
||||
_QUTE_TESTS = [('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
|
||||
('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']),
|
||||
('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']),
|
||||
('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']),
|
||||
("A don't van.", ['A', "don't", 'van', '.'])]
|
||||
|
||||
def __repr__(self):
|
||||
return "TokenizerTestCase<input={}, words={}>".format(repr(self.input), self.expected_tokens)
|
||||
|
||||
def to_tuple(self):
|
||||
return (self.input, self.expected_tokens)
|
||||
|
||||
@classmethod
|
||||
def _parse_output_line(cls, line):
|
||||
for match in cls.WORD_PATTERN.finditer(line):
|
||||
yield match.group(2)
|
||||
|
||||
@classmethod
|
||||
def read_from_file(cls, path):
|
||||
with open(path) as f:
|
||||
input_lines = []
|
||||
output_words = []
|
||||
last_type = None
|
||||
for line in f:
|
||||
if line.startswith(cls.INPUT_PREFIX):
|
||||
if last_type == TokenizerTestCase.OUTPUT_PREFIX and input_lines:
|
||||
yield TokenizerTestCase("\n".join(input_lines), output_words)
|
||||
input_lines = []
|
||||
output_words = []
|
||||
input_lines.append(line[len(cls.INPUT_PREFIX):].strip())
|
||||
last_type = TokenizerTestCase.INPUT_PREFIX
|
||||
elif line.startswith(cls.OUTPUT_PREFIX):
|
||||
output_words.extend(list(cls._parse_output_line(line.strip())))
|
||||
last_type = TokenizerTestCase.OUTPUT_PREFIX
|
||||
else:
|
||||
# Comments separate test cases
|
||||
if input_lines:
|
||||
yield TokenizerTestCase("\n".join(input_lines), output_words)
|
||||
input_lines = []
|
||||
output_words = []
|
||||
last_type = None
|
||||
|
||||
|
||||
_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
|
||||
_HYPHEN_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_hyphen.txt"))
|
||||
_QUOTE_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_quote.txt"))
|
||||
_NUMBER_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_numbers.txt"))
|
||||
_MISC_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_misc.txt"))
|
||||
_IT_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_it.txt"))
|
||||
|
||||
# TODO: Until this gets fixed we cannot really test the URLs: https://github.com/explosion/spaCy/issues/344
|
||||
ALL_TESTCASES = _DOTS_CASES + _HYPHEN_CASES + _QUOTE_CASES + _NUMBER_CASES + _MISC_CASES # + _IT_CASES
|
||||
_DOT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
||||
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
||||
('A .hu.', ['A', '.hu', '.']),
|
||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||
('A pl.', ['A', 'pl.']),
|
||||
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
||||
('Egy..ket.', ['Egy', '..', 'ket', '.']),
|
||||
('Valami... van.', ['Valami', '...', 'van', '.']),
|
||||
('Valami ...van...', ['Valami', '...', 'van', '...']),
|
||||
('Valami...', ['Valami', '...']),
|
||||
('Valami ...', ['Valami', '...']),
|
||||
('Valami ... más.', ['Valami', '...', 'más', '.'])]
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
|
@ -75,8 +223,9 @@ def hu_tokenizer(HU):
|
|||
return HU.tokenizer
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("test_case"), ALL_TESTCASES)
|
||||
def test_testcases(hu_tokenizer, test_case):
|
||||
tokens = hu_tokenizer(test_case.input)
|
||||
@pytest.mark.parametrize(("input", "expected_tokens"),
|
||||
_DEFAULT_TESTS + _HYPHEN_TESTS + _NUMBER_TESTS + _DOT_TESTS + _QUTE_TESTS)
|
||||
def test_testcases(hu_tokenizer, input, expected_tokens):
|
||||
tokens = hu_tokenizer(input)
|
||||
token_list = [token.orth_ for token in tokens if not token.is_space]
|
||||
assert test_case.expected_tokens == token_list # , "{} was erronously tokenized as {}".format(test_case, token_list)
|
||||
assert expected_tokens == token_list
|
||||
|
|
Loading…
Reference in New Issue
Block a user