diff --git a/.github/contributors/DoomCoder.md b/.github/contributors/DoomCoder.md new file mode 100644 index 000000000..0b9938bdc --- /dev/null +++ b/.github/contributors/DoomCoder.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Piotr Książek | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22.11.2018 | +| GitHub username | DoomCoder | +| Website (optional) | | diff --git a/.github/contributors/Gizzio.md b/.github/contributors/Gizzio.md new file mode 100644 index 000000000..b9ca424d9 --- /dev/null +++ b/.github/contributors/Gizzio.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Stanisław Giziński | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 21.11.2018 | +| GitHub username | Gizzio | +| Website (optional) | | \ No newline at end of file diff --git a/.github/contributors/MateuszOlko.md b/.github/contributors/MateuszOlko.md new file mode 100644 index 000000000..04467c749 --- /dev/null +++ b/.github/contributors/MateuszOlko.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Mateusz Olko | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22.11.2018 | +| GitHub username | MateuszOlko | +| Website (optional) | | diff --git a/.github/contributors/kowaalczyk.md b/.github/contributors/kowaalczyk.md new file mode 100644 index 000000000..c367c913d --- /dev/null +++ b/.github/contributors/kowaalczyk.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name |Krzysztof Kowalczyk | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date |22.11.2018 | +| GitHub username |kowaalczyk | +| Website (optional) |kowaalczyk.pl | diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index d77a9cb51..9922db89e 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -4,6 +4,8 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -14,11 +16,13 @@ from ...util import update_exc, add_lookups class PolishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "pl" lex_attr_getters[NORM] = add_lookups( Language.Defaults.lex_attr_getters[NORM], BASE_NORMS ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + infixes = tuple(TOKENIZER_INFIXES) stop_words = STOP_WORDS tag_map = TAG_MAP diff --git a/spacy/lang/pl/_tokenizer_exceptions_list.py b/spacy/lang/pl/_tokenizer_exceptions_list.py new file mode 100644 index 000000000..ae8806796 --- /dev/null +++ b/spacy/lang/pl/_tokenizer_exceptions_list.py @@ -0,0 +1,1441 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +# The following list consists of: +# - exceptions generated from polish_srx_rules [1] +# (https://github.com/milekpl/polish_srx_rules) +# - abbreviations parsed from Wikipedia +# - some manually added exceptions +# +# [1] M. Miłkowski and J. Lipski, +# "Using SRX Standard for Sentence Segmentation," in LTC 2009, +# Lecture Notes in Artificial Intelligence 6562, +# Z. Vetulani, Ed. 
Berlin Heidelberg: Springer-Verlag, 2011, pp. 172–182. +PL_BASE_EXCEPTIONS = ['0.', +'1.', +'10.', +'2.', +'3.', +'4.', +'5.', +'6.', +'7.', +'8.', +'9.', +'A.A.', +'A.B.', +'A.C.', +'A.D.', +'A.E.', +'A.F.', +'A.G.', +'A.H.', +'A.I.', +'A.J.', +'A.K.', +'A.L.', +'A.M.', +'A.N.', +'A.O.', +'A.P.', +'A.R.', +'A.S.', +'A.T.', +'A.U.', +'A.W.', +'A.Y.', +'A.Z.', +'A.Ó.', +'A.Ą.', +'A.Ć.', +'A.Ę.', +'A.Ł.', +'A.Ń.', +'A.Ś.', +'A.Ź.', +'A.Ż.', +'Ad.', +'Adw.', +'Al.', +'Art.', +'B.A.', +'B.B.', +'B.C.', +'B.D.', +'B.E.', +'B.F.', +'B.G.', +'B.H.', +'B.I.', +'B.J.', +'B.K.', +'B.L.', +'B.M.', +'B.N.', +'B.O.', +'B.P.', +'B.R.', +'B.S.', +'B.T.', +'B.U.', +'B.W.', +'B.Y.', +'B.Z.', +'B.Ó.', +'B.Ą.', +'B.Ć.', +'B.Ę.', +'B.Ł.', +'B.Ń.', +'B.Ś.', +'B.Ź.', +'B.Ż.', +'D.A.', +'D.B.', +'D.C.', +'D.D.', +'D.E.', +'D.F.', +'D.G.', +'D.H.', +'D.I.', +'D.J.', +'D.K.', +'D.L.', +'D.M.', +'D.N.', +'D.O.', +'D.P.', +'D.R.', +'D.S.', +'D.T.', +'D.U.', +'D.W.', +'D.Y.', +'D.Z.', +'D.Ó.', +'D.Ą.', +'D.Ć.', +'D.Ę.', +'D.Ł.', +'D.Ń.', +'D.Ś.', +'D.Ź.', +'D.Ż.', +'Dh.', +'Doc.', +'Dr.', +'Dyr.', +'Dyw.', +'Dz.U.', +'E.A.', +'E.B.', +'E.C.', +'E.D.', +'E.E.', +'E.F.', +'E.G.', +'E.H.', +'E.I.', +'E.J.', +'E.K.', +'E.L.', +'E.M.', +'E.N.', +'E.O.', +'E.P.', +'E.R.', +'E.S.', +'E.T.', +'E.U.', +'E.W.', +'E.Y.', +'E.Z.', +'E.Ó.', +'E.Ą.', +'E.Ć.', +'E.Ę.', +'E.Ł.', +'E.Ń.', +'E.Ś.', +'E.Ź.', +'E.Ż.', +'F.A.', +'F.B.', +'F.C.', +'F.D.', +'F.E.', +'F.F.', +'F.G.', +'F.H.', +'F.I.', +'F.J.', +'F.K.', +'F.L.', +'F.M.', +'F.N.', +'F.O.', +'F.P.', +'F.R.', +'F.S.', +'F.T.', +'F.U.', +'F.W.', +'F.Y.', +'F.Z.', +'F.Ó.', +'F.Ą.', +'F.Ć.', +'F.Ę.', +'F.Ł.', +'F.Ń.', +'F.Ś.', +'F.Ź.', +'F.Ż.', +'G.A.', +'G.B.', +'G.C.', +'G.D.', +'G.E.', +'G.F.', +'G.G.', +'G.H.', +'G.I.', +'G.J.', +'G.K.', +'G.L.', +'G.M.', +'G.N.', +'G.O.', +'G.P.', +'G.R.', +'G.S.', +'G.T.', +'G.U.', +'G.W.', +'G.Y.', +'G.Z.', +'G.Ó.', +'G.Ą.', +'G.Ć.', +'G.Ę.', +'G.Ł.', +'G.Ń.', +'G.Ś.', +'G.Ź.', +'G.Ż.', +'H.A.', +'H.B.', +'H.C.', 
+'H.D.', +'H.E.', +'H.F.', +'H.G.', +'H.H.', +'H.I.', +'H.J.', +'H.K.', +'H.L.', +'H.M.', +'H.N.', +'H.O.', +'H.P.', +'H.R.', +'H.S.', +'H.T.', +'H.U.', +'H.W.', +'H.Y.', +'H.Z.', +'H.Ó.', +'H.Ą.', +'H.Ć.', +'H.Ę.', +'H.Ł.', +'H.Ń.', +'H.Ś.', +'H.Ź.', +'H.Ż.', +'Hr.', +'I.A.', +'I.B.', +'I.C.', +'I.D.', +'I.E.', +'I.F.', +'I.G.', +'I.H.', +'I.I.', +'I.J.', +'I.K.', +'I.L.', +'I.M.', +'I.N.', +'I.O.', +'I.P.', +'I.R.', +'I.S.', +'I.T.', +'I.U.', +'I.W.', +'I.Y.', +'I.Z.', +'I.Ó.', +'I.Ą.', +'I.Ć.', +'I.Ę.', +'I.Ł.', +'I.Ń.', +'I.Ś.', +'I.Ź.', +'I.Ż.', +'Inż.', +'J.A.', +'J.B.', +'J.C.', +'J.D.', +'J.E.', +'J.F.', +'J.G.', +'J.H.', +'J.I.', +'J.J.', +'J.K.', +'J.L.', +'J.M.', +'J.N.', +'J.O.', +'J.P.', +'J.R.', +'J.S.', +'J.T.', +'J.U.', +'J.W.', +'J.Y.', +'J.Z.', +'J.Ó.', +'J.Ą.', +'J.Ć.', +'J.Ę.', +'J.Ł.', +'J.Ń.', +'J.Ś.', +'J.Ź.', +'J.Ż.', +'K.A.', +'K.B.', +'K.C.', +'K.D.', +'K.E.', +'K.F.', +'K.G.', +'K.H.', +'K.I.', +'K.J.', +'K.K.', +'K.L.', +'K.M.', +'K.N.', +'K.O.', +'K.P.', +'K.R.', +'K.S.', +'K.T.', +'K.U.', +'K.W.', +'K.Y.', +'K.Z.', +'K.Ó.', +'K.Ą.', +'K.Ć.', +'K.Ę.', +'K.Ł.', +'K.Ń.', +'K.Ś.', +'K.Ź.', +'K.Ż.', +'Ks.', +'L.A.', +'L.B.', +'L.C.', +'L.D.', +'L.E.', +'L.F.', +'L.G.', +'L.H.', +'L.I.', +'L.J.', +'L.K.', +'L.L.', +'L.M.', +'L.N.', +'L.O.', +'L.P.', +'L.R.', +'L.S.', +'L.T.', +'L.U.', +'L.W.', +'L.Y.', +'L.Z.', +'L.Ó.', +'L.Ą.', +'L.Ć.', +'L.Ę.', +'L.Ł.', +'L.Ń.', +'L.Ś.', +'L.Ź.', +'L.Ż.', +'Lek.', +'M.A.', +'M.B.', +'M.C.', +'M.D.', +'M.E.', +'M.F.', +'M.G.', +'M.H.', +'M.I.', +'M.J.', +'M.K.', +'M.L.', +'M.M.', +'M.N.', +'M.O.', +'M.P.', +'M.R.', +'M.S.', +'M.T.', +'M.U.', +'M.W.', +'M.Y.', +'M.Z.', +'M.Ó.', +'M.Ą.', +'M.Ć.', +'M.Ę.', +'M.Ł.', +'M.Ń.', +'M.Ś.', +'M.Ź.', +'M.Ż.', +'Mat.', +'Mec.', +'Mojż.', +'N.A.', +'N.B.', +'N.C.', +'N.D.', +'N.E.', +'N.F.', +'N.G.', +'N.H.', +'N.I.', +'N.J.', +'N.K.', +'N.L.', +'N.M.', +'N.N.', +'N.O.', +'N.P.', +'N.R.', +'N.S.', +'N.T.', +'N.U.', +'N.W.', +'N.Y.', +'N.Z.', +'N.Ó.', +'N.Ą.', +'N.Ć.', 
+'N.Ę.', +'N.Ł.', +'N.Ń.', +'N.Ś.', +'N.Ź.', +'N.Ż.', +'Na os.', +'Nadkom.', +'Najśw.', +'Nb.', +'Np.', +'O.A.', +'O.B.', +'O.C.', +'O.D.', +'O.E.', +'O.F.', +'O.G.', +'O.H.', +'O.I.', +'O.J.', +'O.K.', +'O.L.', +'O.M.', +'O.N.', +'O.O.', +'O.P.', +'O.R.', +'O.S.', +'O.T.', +'O.U.', +'O.W.', +'O.Y.', +'O.Z.', +'O.Ó.', +'O.Ą.', +'O.Ć.', +'O.Ę.', +'O.Ł.', +'O.Ń.', +'O.Ś.', +'O.Ź.', +'O.Ż.', +'OO.', +'Oo.', +'P.A.', +'P.B.', +'P.C.', +'P.D.', +'P.E.', +'P.F.', +'P.G.', +'P.H.', +'P.I.', +'P.J.', +'P.K.', +'P.L.', +'P.M.', +'P.N.', +'P.O.', +'P.P.', +'P.R.', +'P.S.', +'P.T.', +'P.U.', +'P.W.', +'P.Y.', +'P.Z.', +'P.Ó.', +'P.Ą.', +'P.Ć.', +'P.Ę.', +'P.Ł.', +'P.Ń.', +'P.Ś.', +'P.Ź.', +'P.Ż.', +'Podkom.', +'Przyp.', +'Ps.', +'Pt.', +'Płk.', +'R.A.', +'R.B.', +'R.C.', +'R.D.', +'R.E.', +'R.F.', +'R.G.', +'R.H.', +'R.I.', +'R.J.', +'R.K.', +'R.L.', +'R.M.', +'R.N.', +'R.O.', +'R.P.', +'R.R.', +'R.S.', +'R.T.', +'R.U.', +'R.W.', +'R.Y.', +'R.Z.', +'R.Ó.', +'R.Ą.', +'R.Ć.', +'R.Ę.', +'R.Ł.', +'R.Ń.', +'R.Ś.', +'R.Ź.', +'R.Ż.', +'Red.', +'Reż.', +'Ryc.', +'Rys.', +'S.A.', +'S.B.', +'S.C.', +'S.D.', +'S.E.', +'S.F.', +'S.G.', +'S.H.', +'S.I.', +'S.J.', +'S.K.', +'S.L.', +'S.M.', +'S.N.', +'S.O.', +'S.P.', +'S.R.', +'S.S.', +'S.T.', +'S.U.', +'S.W.', +'S.Y.', +'S.Z.', +'S.Ó.', +'S.Ą.', +'S.Ć.', +'S.Ę.', +'S.Ł.', +'S.Ń.', +'S.Ś.', +'S.Ź.', +'S.Ż.', +'Sp.', +'Spółdz.', +'Stow.', +'Stoł.', +'Sz.P.', +'Szer.', +'T.A.', +'T.B.', +'T.C.', +'T.D.', +'T.E.', +'T.F.', +'T.G.', +'T.H.', +'T.I.', +'T.J.', +'T.K.', +'T.L.', +'T.M.', +'T.N.', +'T.O.', +'T.P.', +'T.R.', +'T.S.', +'T.T.', +'T.U.', +'T.W.', +'T.Y.', +'T.Z.', +'T.Ó.', +'T.Ą.', +'T.Ć.', +'T.Ę.', +'T.Ł.', +'T.Ń.', +'T.Ś.', +'T.Ź.', +'T.Ż.', +'Tow.', +'Tzw.', +'U.A.', +'U.B.', +'U.C.', +'U.D.', +'U.E.', +'U.F.', +'U.G.', +'U.H.', +'U.I.', +'U.J.', +'U.K.', +'U.L.', +'U.M.', +'U.N.', +'U.O.', +'U.P.', +'U.R.', +'U.S.', +'U.T.', +'U.U.', +'U.W.', +'U.Y.', +'U.Z.', +'U.Ó.', +'U.Ą.', +'U.Ć.', +'U.Ę.', +'U.Ł.', +'U.Ń.', +'U.Ś.', 
+'U.Ź.', +'U.Ż.', +'W.A.', +'W.B.', +'W.C.', +'W.D.', +'W.E.', +'W.F.', +'W.G.', +'W.H.', +'W.I.', +'W.J.', +'W.K.', +'W.L.', +'W.M.', +'W.N.', +'W.O.', +'W.P.', +'W.R.', +'W.S.', +'W.T.', +'W.U.', +'W.W.', +'W.Y.', +'W.Z.', +'W.Ó.', +'W.Ą.', +'W.Ć.', +'W.Ę.', +'W.Ł.', +'W.Ń.', +'W.Ś.', +'W.Ź.', +'W.Ż.', +'Y.A.', +'Y.B.', +'Y.C.', +'Y.D.', +'Y.E.', +'Y.F.', +'Y.G.', +'Y.H.', +'Y.I.', +'Y.J.', +'Y.K.', +'Y.L.', +'Y.M.', +'Y.N.', +'Y.O.', +'Y.P.', +'Y.R.', +'Y.S.', +'Y.T.', +'Y.U.', +'Y.W.', +'Y.Y.', +'Y.Z.', +'Y.Ó.', +'Y.Ą.', +'Y.Ć.', +'Y.Ę.', +'Y.Ł.', +'Y.Ń.', +'Y.Ś.', +'Y.Ź.', +'Y.Ż.', +'Z.A.', +'Z.B.', +'Z.C.', +'Z.D.', +'Z.E.', +'Z.F.', +'Z.G.', +'Z.H.', +'Z.I.', +'Z.J.', +'Z.K.', +'Z.L.', +'Z.M.', +'Z.N.', +'Z.O.', +'Z.P.', +'Z.R.', +'Z.S.', +'Z.T.', +'Z.U.', +'Z.W.', +'Z.Y.', +'Z.Z.', +'Z.Ó.', +'Z.Ą.', +'Z.Ć.', +'Z.Ę.', +'Z.Ł.', +'Z.Ń.', +'Z.Ś.', +'Z.Ź.', +'Z.Ż.', +'Zob.', +'a.', +'ad.', +'adw.', +'afr.', +'ags.', +'akad.', +'al.', +'alb.', +'am.', +'amer.', +'ang.', +'aor.', +'ap.', +'apost.', +'arch.', +'arcyks.', +'art.', +'artyst.', +'asp.', +'astr.', +'aust.', +'austr.', +'austral.', +'b.', +'bałt.', +'bdb.', +'belg.', +'białorus.', +'białost.', +'bm.', +'bot.', +'bp.', +'br.', +'bryg.', +'bryt.', +'bułg.', +'bł.', +'c.b.d.o.', +'c.k.', +'c.o.', +'cbdu.', +'cd.', +'cdn.', +'centr.', +'ces.', +'chem.', +'chir.', +'chiń.', +'chor.', +'chorw.', +'cieśn.', +'cnd.', +'cyg.', +'cyt.', +'cyw.', +'cz.', +'czes.', +'czw.', +'czyt.', +'d.', +'daw.', +'dcn.', +'dekl.', +'demokr.', +'det.', +'dh.', +'diec.', +'dk.', +'dn.', +'doc.', +'doktor h.c.', +'dol.', +'dolnośl.', +'dost.', +'dosł.', +'dot.', +'dr h.c.', +'dr hab.', +'dr.', +'ds.', +'dst.', +'duszp.', +'dypl.', +'dyr.', +'dyw.', +'dł.', +'egz.', +'ekol.', +'ekon.', +'elektr.', +'em.', +'ent.', +'est.', +'europ.', +'ew.', +'fab.', +'farm.', +'fot.', +'fr.', +'franc.', +'g.', +'gastr.', +'gat.', +'gd.', +'gen.', +'geogr.', +'geol.', +'gimn.', +'gm.', +'godz.', +'gorz.', +'gosp.', +'gosp.-polit.', +'gr.', 
+'gram.', +'grub.', +'górn.', +'głęb.', +'h.c.', +'hab.', +'hist.', +'hiszp.', +'hitl.', +'hm.', +'hot.', +'hr.', +'i in.', +'i s.', +'id.', +'ie.', +'im.', +'in.', +'inż.', +'iron.', +'itd.', +'itp.', +'j.', +'j.a.', +'jez.', +'jn.', +'jw.', +'jwt.', +'k.', +'k.k.', +'k.o.', +'k.p.a.', +'k.p.c.', +'k.r.', +'k.r.o.', +'kard.', +'kark.', +'kasz.', +'kat.', +'katol.', +'kier.', +'kk.', +'kl.', +'kol.', +'kpc.', +'kpt.', +'kr.', +'krak.', +'kryt.', +'ks.', +'książk.', +'kuj.', +'kult.', +'kł.', +'l.', +'laic.', +'lek.', +'lit.', +'lp.', +'lub.', +'m.', +'m.b.', +'m.in.', +'m.p.', +'m.st.', +'mar.', +'maz.', +'małop.', +'mec.', +'med.', +'mgr.', +'min.', +'mn.', +'mn.w.', +'muz.', +'mł.', +'n.', +'n.e.', +'n.p.m.', +'n.p.u.', +'na os.', +'nadkom.', +'najśw.', +'nb.', +'niedz.', +'niem.', +'norw.', +'np.', +'nt.', +'nż.', +'o s.', +'o.', +'oO.', +'ob.', +'odc.', +'odp.', +'ok.', +'oo.', +'op.', +'os.', +'p.', +'p.a.', +'p.f.', +'p.f.v.', +'p.n.e.', +'p.o.', +'p.p.', +'p.p.m.', +'p.r.', +'p.r.v.', +'phm.', +'pie.', +'pl.', +'pn.', +'pocz.', +'pod.', +'podgat.', +'podkarp.', +'podkom.', +'poet.', +'poj.', +'pok.', +'pol.', +'pom.', +'pon.', +'poprz.', +'por.', +'port.', +'posp.', +'pow.', +'poz.', +'poł.', +'pp.', +'ppanc.', +'ppor.', +'ppoż.', +'prawdop.', +'proc.', +'prof.', +'prok.', +'przed Chr.', +'przyp.', +'ps.', +'pseud.', +'pt.', +'pw.', +'półn.', +'płd.', +'płk.', +'płn.', +'r.', +'r.ż.', +'red.', +'reż.', +'ros.', +'rozdz.', +'rtg.', +'rtm.', +'rub.', +'rum.', +'ryc.', +'rys.', +'rz.', +'s.', +'serb.', +'sierż.', +'skr.', +'sob.', +'sp.', +'społ.', +'spółdz.', +'spółgł.', +'st.', +'st.rus.', +'stow.', +'stoł.', +'str.', +'sud.', +'szczec.', +'szer.', +'szt.', +'szw.', +'szwajc.', +'słow.', +'t.', +'t.j.', +'tatrz.', +'tel.', +'tj.', +'tow.', +'trl.', +'tryb.', +'ts.', +'tur.', +'tys.', +'tzn.', +'tzw.', +'tłum.', +'u s.', +'ub.', +'ukr.', +'ul.', +'up.', +'ur.', +'v.v.', +'vs.', +'w.', +'warm.', +'wlk.', +'wlkp.', +'woj.', +'wroc.', +'ws.', +'wsch.', +'wt.', 
+'ww.', +'wyb.', +'wyd.', +'wyj.', +'wym.', +'wyst.', +'wył.', +'wyż.', +'wzgl.', +'wędr.', +'węg.', +'wł.', +'x.', +'xx.', +'zach.', +'zagr.', +'zak.', +'zakł.', +'zal.', +'zam.', +'zast.', +'zaw.', +'zazw.', +'zał.', +'zdr.', +'zew.', +'zewn.', +'ziel.', +'zm.', +'zn.', +'zob.', +'zool.', +'zw.', +'ząbk.', +'Ó.A.', +'Ó.B.', +'Ó.C.', +'Ó.D.', +'Ó.E.', +'Ó.F.', +'Ó.G.', +'Ó.H.', +'Ó.I.', +'Ó.J.', +'Ó.K.', +'Ó.L.', +'Ó.M.', +'Ó.N.', +'Ó.O.', +'Ó.P.', +'Ó.R.', +'Ó.S.', +'Ó.T.', +'Ó.U.', +'Ó.W.', +'Ó.Y.', +'Ó.Z.', +'Ó.Ó.', +'Ó.Ą.', +'Ó.Ć.', +'Ó.Ę.', +'Ó.Ł.', +'Ó.Ń.', +'Ó.Ś.', +'Ó.Ź.', +'Ó.Ż.', +'Ą.A.', +'Ą.B.', +'Ą.C.', +'Ą.D.', +'Ą.E.', +'Ą.F.', +'Ą.G.', +'Ą.H.', +'Ą.I.', +'Ą.J.', +'Ą.K.', +'Ą.L.', +'Ą.M.', +'Ą.N.', +'Ą.O.', +'Ą.P.', +'Ą.R.', +'Ą.S.', +'Ą.T.', +'Ą.U.', +'Ą.W.', +'Ą.Y.', +'Ą.Z.', +'Ą.Ó.', +'Ą.Ą.', +'Ą.Ć.', +'Ą.Ę.', +'Ą.Ł.', +'Ą.Ń.', +'Ą.Ś.', +'Ą.Ź.', +'Ą.Ż.', +'Ć.A.', +'Ć.B.', +'Ć.C.', +'Ć.D.', +'Ć.E.', +'Ć.F.', +'Ć.G.', +'Ć.H.', +'Ć.I.', +'Ć.J.', +'Ć.K.', +'Ć.L.', +'Ć.M.', +'Ć.N.', +'Ć.O.', +'Ć.P.', +'Ć.R.', +'Ć.S.', +'Ć.T.', +'Ć.U.', +'Ć.W.', +'Ć.Y.', +'Ć.Z.', +'Ć.Ó.', +'Ć.Ą.', +'Ć.Ć.', +'Ć.Ę.', +'Ć.Ł.', +'Ć.Ń.', +'Ć.Ś.', +'Ć.Ź.', +'Ć.Ż.', +'ćw.', +'ćwicz.', +'Ę.A.', +'Ę.B.', +'Ę.C.', +'Ę.D.', +'Ę.E.', +'Ę.F.', +'Ę.G.', +'Ę.H.', +'Ę.I.', +'Ę.J.', +'Ę.K.', +'Ę.L.', +'Ę.M.', +'Ę.N.', +'Ę.O.', +'Ę.P.', +'Ę.R.', +'Ę.S.', +'Ę.T.', +'Ę.U.', +'Ę.W.', +'Ę.Y.', +'Ę.Z.', +'Ę.Ó.', +'Ę.Ą.', +'Ę.Ć.', +'Ę.Ę.', +'Ę.Ł.', +'Ę.Ń.', +'Ę.Ś.', +'Ę.Ź.', +'Ę.Ż.', +'Ł.A.', +'Ł.B.', +'Ł.C.', +'Ł.D.', +'Ł.E.', +'Ł.F.', +'Ł.G.', +'Ł.H.', +'Ł.I.', +'Ł.J.', +'Ł.K.', +'Ł.L.', +'Ł.M.', +'Ł.N.', +'Ł.O.', +'Ł.P.', +'Ł.R.', +'Ł.S.', +'Ł.T.', +'Ł.U.', +'Ł.W.', +'Ł.Y.', +'Ł.Z.', +'Ł.Ó.', +'Ł.Ą.', +'Ł.Ć.', +'Ł.Ę.', +'Ł.Ł.', +'Ł.Ń.', +'Ł.Ś.', +'Ł.Ź.', +'Ł.Ż.', +'Łuk.', +'łac.', +'łot.', +'łow.', +'Ń.A.', +'Ń.B.', +'Ń.C.', +'Ń.D.', +'Ń.E.', +'Ń.F.', +'Ń.G.', +'Ń.H.', +'Ń.I.', +'Ń.J.', +'Ń.K.', +'Ń.L.', +'Ń.M.', +'Ń.N.', +'Ń.O.', +'Ń.P.', +'Ń.R.', +'Ń.S.', +'Ń.T.', +'Ń.U.', +'Ń.W.', 
+'Ń.Y.', +'Ń.Z.', +'Ń.Ó.', +'Ń.Ą.', +'Ń.Ć.', +'Ń.Ę.', +'Ń.Ł.', +'Ń.Ń.', +'Ń.Ś.', +'Ń.Ź.', +'Ń.Ż.', +'Ś.A.', +'Ś.B.', +'Ś.C.', +'Ś.D.', +'Ś.E.', +'Ś.F.', +'Ś.G.', +'Ś.H.', +'Ś.I.', +'Ś.J.', +'Ś.K.', +'Ś.L.', +'Ś.M.', +'Ś.N.', +'Ś.O.', +'Ś.P.', +'Ś.R.', +'Ś.S.', +'Ś.T.', +'Ś.U.', +'Ś.W.', +'Ś.Y.', +'Ś.Z.', +'Ś.Ó.', +'Ś.Ą.', +'Ś.Ć.', +'Ś.Ę.', +'Ś.Ł.', +'Ś.Ń.', +'Ś.Ś.', +'Ś.Ź.', +'Ś.Ż.', +'ŚW.', +'Śp.', +'Św.', +'śW.', +'śl.', +'śp.', +'śr.', +'św.', +'Ź.A.', +'Ź.B.', +'Ź.C.', +'Ź.D.', +'Ź.E.', +'Ź.F.', +'Ź.G.', +'Ź.H.', +'Ź.I.', +'Ź.J.', +'Ź.K.', +'Ź.L.', +'Ź.M.', +'Ź.N.', +'Ź.O.', +'Ź.P.', +'Ź.R.', +'Ź.S.', +'Ź.T.', +'Ź.U.', +'Ź.W.', +'Ź.Y.', +'Ź.Z.', +'Ź.Ó.', +'Ź.Ą.', +'Ź.Ć.', +'Ź.Ę.', +'Ź.Ł.', +'Ź.Ń.', +'Ź.Ś.', +'Ź.Ź.', +'Ź.Ż.', +'Ż.A.', +'Ż.B.', +'Ż.C.', +'Ż.D.', +'Ż.E.', +'Ż.F.', +'Ż.G.', +'Ż.H.', +'Ż.I.', +'Ż.J.', +'Ż.K.', +'Ż.L.', +'Ż.M.', +'Ż.N.', +'Ż.O.', +'Ż.P.', +'Ż.R.', +'Ż.S.', +'Ż.T.', +'Ż.U.', +'Ż.W.', +'Ż.Y.', +'Ż.Z.', +'Ż.Ó.', +'Ż.Ą.', +'Ż.Ć.', +'Ż.Ę.', +'Ż.Ł.', +'Ż.Ń.', +'Ż.Ś.', +'Ż.Ź.', +'Ż.Ż.', +'ż.', +'żarg.', +'żart.', +'żyd.', +'żyw.'] diff --git a/spacy/lang/pl/lex_attrs.py b/spacy/lang/pl/lex_attrs.py index 886f95a11..f1379aa50 100644 --- a/spacy/lang/pl/lex_attrs.py +++ b/spacy/lang/pl/lex_attrs.py @@ -34,11 +34,22 @@ _num_words = [ "osiemdziesiąt", "dziewięćdziesiąt", "sto", + "dwieście", + "trzysta", + "czterysta", + "pięćset", + "sześćset", + "siedemset", + "osiemset", + "dziewięćset", "tysiąc", "milion", "miliard", "bilion", + "biliard", "trylion", + "tryliard", + "kwadrylion", ] diff --git a/spacy/lang/pl/polish_srx_rules_LICENSE.txt b/spacy/lang/pl/polish_srx_rules_LICENSE.txt new file mode 100644 index 000000000..995a1b0f7 --- /dev/null +++ b/spacy/lang/pl/polish_srx_rules_LICENSE.txt @@ -0,0 +1,23 @@ + +Copyright (c) 2019, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py new file mode 100644 index 000000000..8fdcaca41 --- /dev/null +++ b/spacy/lang/pl/punctuation.py @@ -0,0 +1,14 @@ +# coding: utf8 +from __future__ import unicode_literals +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +_quotes = QUOTES.replace("'", '') +_infixes = (LIST_ELLIPSES + LIST_ICONS + + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), + r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])'.format(a=ALPHA, q=_quotes), + r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)]) + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/pl/stop_words.py b/spacy/lang/pl/stop_words.py index 9de6aea73..11df67328 100644 --- a/spacy/lang/pl/stop_words.py +++ b/spacy/lang/pl/stop_words.py @@ -1,48 +1,82 @@ # encoding: utf8 + from __future__ import unicode_literals - -# Source: http://www.ranks.nl/stopwords/polish +# sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl STOP_WORDS = set( """ -ach aj albo +a aby ach acz aczkolwiek aj albo ale alez +ależ ani az aż -bardzo bez bo być +bardziej bardzo beda bede bedzie bez bo bowiem by +byc byl byla byli bylo byly bym bynajmniej być był +była było były będzie będą będę -ci cię ciebie co czy +cala cali caly cała cały chce choć ci cie +ciebie cię co cokolwiek coraz cos coś czasami czasem czemu +czy czyli często -daleko dla dlaczego dlatego do dobrze dokąd dość dużo dwa dwaj dwie dwoje dziś -dzisiaj +daleko dla dlaczego dlatego do dobrze dokad dokąd +dosc dość duzo dużo dwa dwaj dwie dwoje dzis +dzisiaj dziś -gdyby gdzie +gdy gdyby gdyz gdyż gdzie gdziekolwiek gdzies gdzieś go +godz -go +i ich ile im inna inne inny +innych iv ix iz iż -ich ile im inny +ja jak jakas 
jakaś jakby jaki jakichs jakichś jakie +jakis jakiz jakiś jakiż jakkolwiek jako jakos jakoś je jeden +jedna jednak jednakze jednakże jedno jednym jedynie jego jej jemu +jesli jest jestem jeszcze jezeli jeśli jeżeli juz już ją -ja ją jak jakby jaki je jeden jedna jedno jego jej jemu jeśli jest jestem -jeżeli już +kazdy każdy kiedy kierunku kilka kilku kims kimś kto +ktokolwiek ktora ktore ktorego ktorej ktory ktorych ktorym ktorzy ktos +ktoś która które którego której który których którym którzy ku -każdy kiedy kierunku kto ku +lecz lub -lub +ma mają mam mamy mało mi miał miedzy +mimo między mna mnie mną moga mogą moi moim moj +moja moje moze mozliwe mozna może możliwe można mu musi +my mój -ma mają mam mi mną mnie moi mój moja moje może mu my +na nad nam nami nas nasi nasz nasza nasze +naszego naszych natomiast natychmiast nawet nia nic nich nie niech +niego niej niemu nigdy nim nimi niz nią niż no -na nam nami nas nasi nasz nasza nasze natychmiast nią nic nich nie niego niej -niemu nigdy nim nimi niż +o obok od ok około on ona one +oni ono oraz oto owszem -obok od około on ona one oni ono owszem +pan pana pani po pod podczas pomimo ponad +poniewaz ponieważ powinien powinna powinni powinno poza prawie przeciez +przecież przed przede przedtem przez przy -po pod ponieważ przed przedtem +raz razie roku rowniez również -są sam sama się skąd +sam sama sie się skad skąd soba sobie sobą +sposob sposób swoje są -tak taki tam ten to tobą tobie tu tutaj twoi twój twoja twoje ty +ta tak taka taki takich takie takze także tam +te tego tej tel temu ten teraz też to toba +tobie tobą totez toteż totobą trzeba tu tutaj twoi twoim +twoj twoja twoje twym twój ty tych tylko tym tys +tzw tę -wam wami was wasi wasz wasza wasze we więc wszystko wtedy wy +u -żaden zawsze że -""".split() +vi vii viii + +w wam wami was wasi wasz wasza wasze we +według wie wiele wielu więc więcej wlasnie wszyscy wszystkich wszystkie +wszystkim wszystko wtedy wy właśnie wśród + +xi xii xiii xiv xv + +z za 
zaden zadna zadne zadnych zapewne zawsze zaś +ze zeby znow znowu znów zostal został + +żaden żadna żadne żadnych że żeby""".split() ) diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index f06ce49e9..c16315804 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -1,7 +1,8 @@ # encoding: utf8 from __future__ import unicode_literals from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN +from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS _exc = {} @@ -19,5 +20,7 @@ for exc_data in [ for orth in ["w.", "r."]: _exc[orth] = [{ORTH: orth}] +for orth in PL_BASE_EXCEPTIONS: + _exc[orth] = [{ORTH: orth}] TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 70bd036f9..67b78e558 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .morph_rules import MORPH_RULES from .lemmatizer import LEMMA_RULES, LOOKUP @@ -22,6 +23,7 @@ class SwedishDefaults(Language.Defaults): ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) morph_rules = MORPH_RULES + tag_map = TAG_MAP infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS diff --git a/spacy/lang/sv/tag_map.py b/spacy/lang/sv/tag_map.py new file mode 100644 index 000000000..9fe4d6872 --- /dev/null +++ b/spacy/lang/sv/tag_map.py @@ -0,0 +1,169 @@ +# coding: utf8 + +""" +Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html +for https://github.com/UniversalDependencies/UD_Swedish-Talbanken +""" + +from __future__ import unicode_literals + +from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X, VERB +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX + +TAG_MAP = 
{ + 'AB': { POS: ADV }, # inte, också, så, bara, nu + 'AB|AN': { POS: ADV }, # t.ex., ca, t_ex, bl.a., s_k + 'AB|KOM': { POS: ADV }, # mer, tidigare, mindre, vidare, mera + 'AB|POS': { POS: ADV }, # mycket, helt, ofta, länge, långt + 'AB|SMS': { POS: ADV }, # över-, in- + 'AB|SUV': { POS: ADV }, # minst, mest, högst, främst, helst + 'DT|MAS|SIN|DEF': { POS: DET }, + 'DT|MAS|SIN|IND': { POS: DET }, + 'DT|NEU|SIN|DEF': { POS: DET }, # det, detta + 'DT|NEU|SIN|IND': { POS: DET }, # ett, något, inget, vart, vartannat + 'DT|NEU|SIN|IND/DEF': { POS: DET }, # allt + 'DT|UTR/NEU|PLU|DEF': { POS: DET }, # de, dessa, bägge, dom + 'DT|UTR/NEU|PLU|IND': { POS: DET }, # några, inga + 'DT|UTR/NEU|PLU|IND/DEF': { POS: DET }, # alla + 'DT|UTR/NEU|SIN/PLU|IND': { POS: DET }, # samma + 'DT|UTR/NEU|SIN|DEF': { POS: DET }, # vardera + 'DT|UTR/NEU|SIN|IND': { POS: DET }, # varje, varenda + 'DT|UTR|SIN|DEF': { POS: DET }, # den, denna + 'DT|UTR|SIN|IND': { POS: DET }, # en, någon, ingen, var, varannan + 'DT|UTR|SIN|IND/DEF': { POS: DET }, # all + 'HA': { POS: ADV }, # när, där, hur, som, då + 'HD|NEU|SIN|IND': { POS: DET }, # vilket + 'HD|UTR/NEU|PLU|IND': { POS: DET }, # vilka + 'HD|UTR|SIN|IND': { POS: DET }, # vilken + 'HP|-|-|-': { POS: PRON }, # som + 'HP|NEU|SIN|IND': { POS: PRON }, # vad, vilket + 'HP|NEU|SIN|IND|SMS': { POS: PRON }, + 'HP|UTR/NEU|PLU|IND': { POS: PRON }, # vilka + 'HP|UTR|SIN|IND': { POS: PRON }, # vilken, vem + 'HS|DEF': { POS: DET }, # vars, vilkas, Vems + 'IE': { POS: PART }, # att + 'IN': { POS: INTJ }, # Jo, ja, nej, fan, visst + 'JJ|AN': { POS: ADJ }, # ev, S:t, Kungl, Kungl., Teol + 'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: ADJ }, # äldres + 'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # större, högre, mindre, bättre, äldre + 'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS': { POS: ADJ }, + 'JJ|POS|MAS|SIN|DEF|GEN': { POS: ADJ }, # enskildes, sjukes, andres + 'JJ|POS|MAS|SIN|DEF|NOM': { POS: ADJ }, # enskilde, sjuke, andre, unge, ene + 
'JJ|POS|NEU|SIN|IND/DEF|NOM': { POS: ADJ }, # eget + 'JJ|POS|NEU|SIN|IND|GEN': { POS: ADJ }, + 'JJ|POS|NEU|SIN|IND|NOM': { POS: ADJ }, # annat, svårt, möjligt, nytt, sådant + 'JJ|POS|UTR/NEU|PLU|IND/DEF|GEN': { POS: ADJ }, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas + 'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM': { POS: ADJ }, # olika, andra, många, stora, vissa + 'JJ|POS|UTR/NEU|PLU|IND|NOM': { POS: ADJ }, # flera, sådana, fler, få, samtliga + 'JJ|POS|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ }, + 'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # bra, ena, enda, nästa, ringa + 'JJ|POS|UTR/NEU|SIN|DEF|GEN': { POS: ADJ }, + 'JJ|POS|UTR/NEU|SIN|DEF|NOM': { POS: ADJ }, # hela, nya, andra, svenska, ekonomiska + 'JJ|POS|UTR|-|-|SMS': { POS: ADJ }, # fri-, låg-, sexual- + 'JJ|POS|UTR|SIN|IND/DEF|NOM': { POS: ADJ }, # egen + 'JJ|POS|UTR|SIN|IND|GEN': { POS: ADJ }, # enskilds + 'JJ|POS|UTR|SIN|IND|NOM': { POS: ADJ }, # stor, annan, själv, sådan, viss + 'JJ|SUV|MAS|SIN|DEF|GEN': { POS: ADJ }, + 'JJ|SUV|MAS|SIN|DEF|NOM': { POS: ADJ }, # störste, främste, äldste, minste + 'JJ|SUV|UTR/NEU|PLU|DEF|NOM': { POS: ADJ }, # flesta + 'JJ|SUV|UTR/NEU|PLU|IND|NOM': { POS: ADJ }, + 'JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM': { POS: ADJ }, # bästa, största, närmaste, viktigaste, högsta + 'JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ }, # störst, bäst, tidigast, högst, fattigast + 'KN': { POS: CCONJ }, # och, eller, som, än, men + 'KN|AN': { POS: CCONJ }, + 'MAD': { POS: PUNCT }, # ., ?, :, !, ... + 'MID': { POS: PUNCT }, # ,, -, :, *, ; + 'NN|-|-|-|-': { POS: NOUN }, # godo, fjol, fullo, somras, måtto + 'NN|AN': { POS: NOUN }, # kr, %, s., dr, kap. 
+ 'NN|NEU|-|-|-': { POS: NOUN }, + 'NN|NEU|-|-|SMS': { POS: NOUN }, # yrkes-, barn-, hem-, fack-, vatten- + 'NN|NEU|PLU|DEF|GEN': { POS: NOUN }, # barnens, årens, u-ländernas, företagens, århundradenas + 'NN|NEU|PLU|DEF|NOM': { POS: NOUN }, # barnen, u-länderna, åren, länderna, könen + 'NN|NEU|PLU|IND|GEN': { POS: NOUN }, # slags, års, barns, länders, tusentals + 'NN|NEU|PLU|IND|NOM': { POS: NOUN }, # barn, år, fall, länder, problem + 'NN|NEU|SIN|DEF|GEN': { POS: NOUN }, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets + 'NN|NEU|SIN|DEF|NOM': { POS: NOUN }, # äktenskapet, samhället, barnet, stället, hemmet + 'NN|NEU|SIN|IND|GEN': { POS: NOUN }, # års, slags, lands, havs, företags + 'NN|NEU|SIN|IND|NOM': { POS: NOUN }, # år, arbete, barn, sätt, äktenskap + 'NN|SMS': { POS: NOUN }, # PCB-, Syd- + 'NN|UTR|-|-|-': { POS: NOUN }, # dags, rätta + 'NN|UTR|-|-|SMS': { POS: NOUN }, # far-, kibbutz-, röntgen-, barna-, hälso- + 'NN|UTR|PLU|DEF|GEN': { POS: NOUN }, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas + 'NN|UTR|PLU|DEF|NOM': { POS: NOUN }, # kvinnorna, föräldrarna, makarna, männen, hyrorna + 'NN|UTR|PLU|IND|GEN': { POS: NOUN }, # människors, kvinnors, dagars, tiders, månaders + 'NN|UTR|PLU|IND|NOM': { POS: NOUN }, # procent, människor, kvinnor, miljoner, kronor + 'NN|UTR|SIN|DEF|GEN': { POS: NOUN }, # kvinnans, världens, familjens, dagens, jordens + 'NN|UTR|SIN|DEF|NOM': { POS: NOUN }, # familjen, kvinnan, mannen, världen, skolan + 'NN|UTR|SIN|IND|GEN': { POS: NOUN }, # sorts, medelålders, makes, kvinnas, veckas + 'NN|UTR|SIN|IND|NOM': { POS: NOUN }, # del, tid, dag, fråga, man + 'PAD': { POS: PUNCT }, # , ), ( + 'PC|AN': { POS: VERB }, + 'PC|PRF|MAS|SIN|DEF|GEN': { POS: VERB }, # avlidnes + 'PC|PRF|MAS|SIN|DEF|NOM': { POS: VERB }, + 'PC|PRF|NEU|SIN|IND|NOM': { POS: VERB }, # taget, sett, särskilt, förbjudet, ökat + 'PC|PRF|UTR/NEU|PLU|IND/DEF|GEN': { POS: VERB }, # försäkrades, anställdas + 'PC|PRF|UTR/NEU|PLU|IND/DEF|NOM': { POS: VERB }, 
# särskilda, gifta, ökade, handikappade, skilda + 'PC|PRF|UTR/NEU|SIN|DEF|GEN': { POS: VERB }, + 'PC|PRF|UTR/NEU|SIN|DEF|NOM': { POS: VERB }, # ökade, gifta, nämnda, nedärvda, dolda + 'PC|PRF|UTR|SIN|IND|GEN': { POS: VERB }, + 'PC|PRF|UTR|SIN|IND|NOM': { POS: VERB }, # särskild, ökad, beredd, gift, oförändrad + 'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: VERB }, # studerandes, sammanboendes, dubbelarbetandes + 'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: VERB }, # följande, beroende, nuvarande, motsvarande, liknande + 'PL': { POS: PART }, # ut, upp, in, till, med + 'PL|SMS': { POS: PART }, + 'PM': { POS: PROPN }, # F, N, Liechtenstein, Danmark, DK + 'PM|GEN': { POS: PROPN }, # Sveriges, EEC:s, Guds, Stockholms, Kristi + 'PM|NOM': { POS: PROPN }, # Sverige, EEC, Stockholm, USA, ATP + 'PM|SMS': { POS: PROPN }, # Göteborgs-, Nord-, Väst- + 'PN|MAS|SIN|DEF|SUB/OBJ': { POS: PRON }, # denne + 'PN|NEU|SIN|DEF|SUB/OBJ': { POS: PRON }, # det, detta, detsamma + 'PN|NEU|SIN|IND|SUB/OBJ': { POS: PRON }, # något, allt, mycket, annat, ingenting + 'PN|UTR/NEU|PLU|DEF|OBJ': { POS: PRON }, # dem, varandra, varann + 'PN|UTR/NEU|PLU|DEF|SUB': { POS: PRON }, # de, bägge + 'PN|UTR/NEU|PLU|DEF|SUB/OBJ': { POS: PRON }, # dessa, dom, båda, den, bådadera + 'PN|UTR/NEU|PLU|IND|SUB/OBJ': { POS: PRON }, # andra, alla, många, sådana, några + 'PN|UTR/NEU|SIN/PLU|DEF|OBJ': { POS: PRON }, # sig, sej + 'PN|UTR|PLU|DEF|OBJ': { POS: PRON }, # oss, er, eder + 'PN|UTR|PLU|DEF|SUB': { POS: PRON }, # vi + 'PN|UTR|SIN|DEF|OBJ': { POS: PRON }, # dig, mig, henne, honom, Er + 'PN|UTR|SIN|DEF|SUB': { POS: PRON }, # du, han, hon, jag, ni + 'PN|UTR|SIN|DEF|SUB/OBJ': { POS: PRON }, # den, denna, densamma + 'PN|UTR|SIN|IND|SUB': { POS: PRON }, # man + 'PN|UTR|SIN|IND|SUB/OBJ': { POS: PRON }, # en, var, någon, ingen, Varannan + 'PP': { POS: ADP }, # i, av, på, för, till + 'PP|AN': { POS: ADP }, # f + 'PS|AN': { POS: DET }, + 'PS|NEU|SIN|DEF': { POS: DET }, # sitt, vårt, ditt, mitt, ert + 'PS|UTR/NEU|PLU|DEF': { 
POS: DET }, # sina, våra, dina, mina + 'PS|UTR/NEU|SIN/PLU|DEF': { POS: DET }, # deras, dess, hans, hennes, varandras + 'PS|UTR|SIN|DEF': { POS: DET }, # sin, vår, din, min, er + 'RG': { POS: NUM }, # 2, 17, 20, 1, 18 + 'RG|GEN': { POS: NUM }, + 'RG|MAS|SIN|DEF|NOM': { POS: NUM }, + 'RG|NEU|SIN|IND|NOM': { POS: NUM }, # ett + 'RG|NOM': { POS: NUM }, # två, tre, 1, 20, 2 + 'RG|SMS': { POS: NUM }, # ett-, 1950-, två-, tre-, 1700- + 'RG|UTR/NEU|SIN|DEF|NOM': { POS: NUM }, + 'RG|UTR|SIN|IND|NOM': { POS: NUM }, # en + 'RO|MAS|SIN|IND/DEF|GEN': { POS: ADJ }, + 'RO|MAS|SIN|IND/DEF|NOM': { POS: ADJ }, # förste + 'RO|GEN': { POS: ADJ }, + 'RO|NOM': { POS: ADJ }, # första, andra, tredje, fjärde, femte + 'SN': { POS: SCONJ }, # att, om, innan, eftersom, medan + 'UO': { POS: X }, # companionship, vice, versa, family, capita + 'VB|AN': { POS: VERB }, # jfr + 'VB|IMP|AKT': { POS: VERB }, # se, Diskutera, låt, Läs, Gå + 'VB|IMP|SFO': { POS: VERB }, # tas + 'VB|INF|AKT': { POS: VERB }, # vara, få, ha, bli, kunna + 'VB|INF|SFO': { POS: VERB }, # användas, finnas, göras, tas, ses + 'VB|KON|PRS|AKT': { POS: VERB }, # vare, Gånge + 'VB|KON|PRT|AKT': { POS: VERB }, # vore, finge + 'VB|KON|PRT|SFO': { POS: VERB }, + 'VB|PRS|AKT': { POS: VERB }, # är, har, kan, får, måste + 'VB|PRS|SFO': { POS: VERB }, # finns, kallas, behövs, beräknas, används + 'VB|PRT|AKT': { POS: VERB }, # skulle, var, hade, kunde, fick + 'VB|PRT|SFO': { POS: VERB }, # fanns, gjordes, höjdes, användes, infördes + 'VB|SMS': { POS: VERB }, # läs- + 'VB|SUP|AKT': { POS: VERB }, # varit, fått, blivit, haft, kommit + 'VB|SUP|SFO': { POS: VERB } # nämnts, gjorts, förändrats, sagts, framhållits +} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 028b0ae62..5032dd071 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -128,6 +128,11 @@ def ca_tokenizer(): return get_lang_class("ca").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def pl_tokenizer(): + return 
get_lang_class("pl").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def tt_tokenizer(): return get_lang_class("tt").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/pl/test_text.py b/spacy/tests/lang/pl/test_text.py new file mode 100644 index 000000000..5a19f3c99 --- /dev/null +++ b/spacy/tests/lang/pl/test_text.py @@ -0,0 +1,17 @@ +# coding: utf-8 +"""Words like numbers are recognized correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text,match', [ + ('10', True), ('1', True), ('10,000', True), ('10,00', True), + ('jeden', True), ('dwa', True), ('milion', True), + ('pies', False), (',', False), ('1/2', True)]) +def test_lex_attrs_like_number(pl_tokenizer, text, match): + tokens = pl_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/pl/test_tokenizer.py b/spacy/tests/lang/pl/test_tokenizer.py new file mode 100644 index 000000000..27eb9af1c --- /dev/null +++ b/spacy/tests/lang/pl/test_tokenizer.py @@ -0,0 +1,60 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +DOT_TESTS = [ + ('tel.', ['tel.']), + ('np.', ['np.']), + ('godz. 
21:37', ['godz.', '21:37']), + ('inż.', ['inż.']), + ('gosp.-polit.', ['gosp.-polit.']), + ('ppoż', ['ppoż']), + ('płn', ['płn']), + ('ul.', ['ul.']), + ('jw.', ['jw.']), + ('itd.', ['itd.']), + ('cdn.', ['cdn.']), + ('itp.', ['itp.']), + ('10,- zł', ['10,-', 'zł']), + ('0 zł 99 gr', ['0', 'zł', '99', 'gr']), + ('0,99 rub.', ['0,99', 'rub.']), + ('dol.', ['dol.']), + ('1000 m n.p.m.', ['1000', 'm', 'n.p.m.']), + ('m.in.', ['m.in.']), + ('p.n.e.', ['p.n.e.']), + ('Sz.P.', ['Sz.P.']), + ('p.o.', ['p.o.']), + ('k.o.', ['k.o.']), + ('m.st.', ['m.st.']), + ('dra.', ['dra', '.']), + ('pp.', ['pp.']), + ('oo.', ['oo.']) +] + +HYPHEN_TESTS = [ + ('5-fluoropentylo-3-pirydynyloindol', ['5-fluoropentylo-3-pirydynyloindol']), + ('NESS-040C5', ['NESS-040C5']), + ('JTE-7-31', ['JTE-7-31']), + ('BAY-59-3074', ['BAY-59-3074']), + ('BAY-38-7271', ['BAY-38-7271']), + ('STS-135', ['STS-135']), + ('5F-PB-22', ['5F-PB-22']), + ('cztero-', ['cztero-']), + ('jedno-', ['jedno-']), + ('dwu-', ['dwu-']), + ('trzy-', ['trzy-']), + ('b-adoratorzy', ['b-adoratorzy']), + ('2-3-4 drzewa', ['2-3-4', 'drzewa']), + ('b-drzewa', ['b-drzewa']) +] + + +TESTCASES = DOT_TESTS + HYPHEN_TESTS + + +@pytest.mark.parametrize('text,expected_tokens', TESTCASES) +def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens): + tokens = pl_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list