From 1448ad100cfa3642904ca5e23426f37fca13905e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Gizi=C5=84ski?= Date: Fri, 8 Feb 2019 04:27:21 +0100 Subject: [PATCH] Improved polish tokenizer and stop words. (#2974) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Improved stop words list * Removed some wrong stop words from list * Improved stop words list * Removed some wrong stop words from list * Improved Polish Tokenizer (#38) * Add tests for polish tokenizer * Add polish tokenizer exceptions * Don't split any words containing hyphens * Fix test case with wrong model answer * Remove commented out line of code until better solution is found * Add source srx' license * Rename exception_list.py to match spaCy conventionality * Add a brief explanation of where the exception list comes from * Add newline after each exception * Rename COPYING.txt to LICENSE * Delete old files * Add header to the license * Agreements signed * Stanisław Giziński agreement * Krzysztof Kowalczyk - signed agreement * Mateusz Olko agreement * Add DoomCoder's contributor agreement * Improve like number checking in polish lang * like num tests added * all from SI system added * Final licence and removed splitting exceptions * Added polish stop words to LEX_ATTRS * Add encoding info to pl tokenizer exceptions --- .github/contributors/DoomCoder.md | 106 ++ .github/contributors/Gizzio.md | 106 ++ .github/contributors/MateuszOlko.md | 106 ++ .github/contributors/kowaalczyk.md | 106 ++ spacy/lang/pl/__init__.py | 4 + spacy/lang/pl/_tokenizer_exceptions_list.py | 1441 +++++++++++++++++++ spacy/lang/pl/lex_attrs.py | 14 +- spacy/lang/pl/polish_srx_rules_LICENSE.txt | 23 + spacy/lang/pl/punctuation.py | 14 + spacy/lang/pl/stop_words.py | 82 +- spacy/lang/pl/tokenizer_exceptions.py | 4 +- spacy/tests/conftest.py | 3 + spacy/tests/lang/pl/test_text.py | 17 + spacy/tests/lang/pl/test_tokenizer.py | 60 + 14 files changed, 2055 insertions(+), 31 
deletions(-) create mode 100644 .github/contributors/DoomCoder.md create mode 100644 .github/contributors/Gizzio.md create mode 100644 .github/contributors/MateuszOlko.md create mode 100644 .github/contributors/kowaalczyk.md create mode 100644 spacy/lang/pl/_tokenizer_exceptions_list.py create mode 100644 spacy/lang/pl/polish_srx_rules_LICENSE.txt create mode 100644 spacy/lang/pl/punctuation.py create mode 100644 spacy/tests/lang/pl/test_text.py create mode 100644 spacy/tests/lang/pl/test_tokenizer.py diff --git a/.github/contributors/DoomCoder.md b/.github/contributors/DoomCoder.md new file mode 100644 index 000000000..0b9938bdc --- /dev/null +++ b/.github/contributors/DoomCoder.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Piotr Książek | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22.11.2018 | +| GitHub username | DoomCoder | +| Website (optional) | | diff --git a/.github/contributors/Gizzio.md b/.github/contributors/Gizzio.md new file mode 100644 index 000000000..b9ca424d9 --- /dev/null +++ b/.github/contributors/Gizzio.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Stanisław Giziński | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 21.11.2018 | +| GitHub username | Gizzio | +| Website (optional) | | \ No newline at end of file diff --git a/.github/contributors/MateuszOlko.md b/.github/contributors/MateuszOlko.md new file mode 100644 index 000000000..04467c749 --- /dev/null +++ b/.github/contributors/MateuszOlko.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Mateusz Olko | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22.11.2018 | +| GitHub username | MateuszOlko | +| Website (optional) | | diff --git a/.github/contributors/kowaalczyk.md b/.github/contributors/kowaalczyk.md new file mode 100644 index 000000000..c367c913d --- /dev/null +++ b/.github/contributors/kowaalczyk.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name |Krzysztof Kowalczyk | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date |22.11.2018 | +| GitHub username |kowaalczyk | +| Website (optional) |kowaalczyk.pl | diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 80011f9d8..901a36153 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -13,9 +15,11 @@ from ...util import update_exc, add_lookups class PolishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: 'pl' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + infixes = tuple(TOKENIZER_INFIXES) stop_words = STOP_WORDS diff --git a/spacy/lang/pl/_tokenizer_exceptions_list.py b/spacy/lang/pl/_tokenizer_exceptions_list.py new file mode 100644 index 000000000..ae8806796 --- /dev/null +++ b/spacy/lang/pl/_tokenizer_exceptions_list.py @@ -0,0 +1,1441 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +# The following list consists of: +# - exceptions generated from polish_srx_rules [1] +# 
(https://github.com/milekpl/polish_srx_rules) +# - abbreviations parsed from Wikipedia +# - some manually added exceptions +# +# [1] M. Miłkowski and J. Lipski, +# "Using SRX Standard for Sentence Segmentation," in LTC 2009, +# Lecture Notes in Artificial Intelligence 6562, +# Z. Vetulani, Ed. Berlin Heidelberg: Springer-Verlag, 2011, pp. 172–182. +PL_BASE_EXCEPTIONS = ['0.', +'1.', +'10.', +'2.', +'3.', +'4.', +'5.', +'6.', +'7.', +'8.', +'9.', +'A.A.', +'A.B.', +'A.C.', +'A.D.', +'A.E.', +'A.F.', +'A.G.', +'A.H.', +'A.I.', +'A.J.', +'A.K.', +'A.L.', +'A.M.', +'A.N.', +'A.O.', +'A.P.', +'A.R.', +'A.S.', +'A.T.', +'A.U.', +'A.W.', +'A.Y.', +'A.Z.', +'A.Ó.', +'A.Ą.', +'A.Ć.', +'A.Ę.', +'A.Ł.', +'A.Ń.', +'A.Ś.', +'A.Ź.', +'A.Ż.', +'Ad.', +'Adw.', +'Al.', +'Art.', +'B.A.', +'B.B.', +'B.C.', +'B.D.', +'B.E.', +'B.F.', +'B.G.', +'B.H.', +'B.I.', +'B.J.', +'B.K.', +'B.L.', +'B.M.', +'B.N.', +'B.O.', +'B.P.', +'B.R.', +'B.S.', +'B.T.', +'B.U.', +'B.W.', +'B.Y.', +'B.Z.', +'B.Ó.', +'B.Ą.', +'B.Ć.', +'B.Ę.', +'B.Ł.', +'B.Ń.', +'B.Ś.', +'B.Ź.', +'B.Ż.', +'D.A.', +'D.B.', +'D.C.', +'D.D.', +'D.E.', +'D.F.', +'D.G.', +'D.H.', +'D.I.', +'D.J.', +'D.K.', +'D.L.', +'D.M.', +'D.N.', +'D.O.', +'D.P.', +'D.R.', +'D.S.', +'D.T.', +'D.U.', +'D.W.', +'D.Y.', +'D.Z.', +'D.Ó.', +'D.Ą.', +'D.Ć.', +'D.Ę.', +'D.Ł.', +'D.Ń.', +'D.Ś.', +'D.Ź.', +'D.Ż.', +'Dh.', +'Doc.', +'Dr.', +'Dyr.', +'Dyw.', +'Dz.U.', +'E.A.', +'E.B.', +'E.C.', +'E.D.', +'E.E.', +'E.F.', +'E.G.', +'E.H.', +'E.I.', +'E.J.', +'E.K.', +'E.L.', +'E.M.', +'E.N.', +'E.O.', +'E.P.', +'E.R.', +'E.S.', +'E.T.', +'E.U.', +'E.W.', +'E.Y.', +'E.Z.', +'E.Ó.', +'E.Ą.', +'E.Ć.', +'E.Ę.', +'E.Ł.', +'E.Ń.', +'E.Ś.', +'E.Ź.', +'E.Ż.', +'F.A.', +'F.B.', +'F.C.', +'F.D.', +'F.E.', +'F.F.', +'F.G.', +'F.H.', +'F.I.', +'F.J.', +'F.K.', +'F.L.', +'F.M.', +'F.N.', +'F.O.', +'F.P.', +'F.R.', +'F.S.', +'F.T.', +'F.U.', +'F.W.', +'F.Y.', +'F.Z.', +'F.Ó.', +'F.Ą.', +'F.Ć.', +'F.Ę.', +'F.Ł.', +'F.Ń.', +'F.Ś.', +'F.Ź.', +'F.Ż.', +'G.A.', +'G.B.', 
+'G.C.', +'G.D.', +'G.E.', +'G.F.', +'G.G.', +'G.H.', +'G.I.', +'G.J.', +'G.K.', +'G.L.', +'G.M.', +'G.N.', +'G.O.', +'G.P.', +'G.R.', +'G.S.', +'G.T.', +'G.U.', +'G.W.', +'G.Y.', +'G.Z.', +'G.Ó.', +'G.Ą.', +'G.Ć.', +'G.Ę.', +'G.Ł.', +'G.Ń.', +'G.Ś.', +'G.Ź.', +'G.Ż.', +'H.A.', +'H.B.', +'H.C.', +'H.D.', +'H.E.', +'H.F.', +'H.G.', +'H.H.', +'H.I.', +'H.J.', +'H.K.', +'H.L.', +'H.M.', +'H.N.', +'H.O.', +'H.P.', +'H.R.', +'H.S.', +'H.T.', +'H.U.', +'H.W.', +'H.Y.', +'H.Z.', +'H.Ó.', +'H.Ą.', +'H.Ć.', +'H.Ę.', +'H.Ł.', +'H.Ń.', +'H.Ś.', +'H.Ź.', +'H.Ż.', +'Hr.', +'I.A.', +'I.B.', +'I.C.', +'I.D.', +'I.E.', +'I.F.', +'I.G.', +'I.H.', +'I.I.', +'I.J.', +'I.K.', +'I.L.', +'I.M.', +'I.N.', +'I.O.', +'I.P.', +'I.R.', +'I.S.', +'I.T.', +'I.U.', +'I.W.', +'I.Y.', +'I.Z.', +'I.Ó.', +'I.Ą.', +'I.Ć.', +'I.Ę.', +'I.Ł.', +'I.Ń.', +'I.Ś.', +'I.Ź.', +'I.Ż.', +'Inż.', +'J.A.', +'J.B.', +'J.C.', +'J.D.', +'J.E.', +'J.F.', +'J.G.', +'J.H.', +'J.I.', +'J.J.', +'J.K.', +'J.L.', +'J.M.', +'J.N.', +'J.O.', +'J.P.', +'J.R.', +'J.S.', +'J.T.', +'J.U.', +'J.W.', +'J.Y.', +'J.Z.', +'J.Ó.', +'J.Ą.', +'J.Ć.', +'J.Ę.', +'J.Ł.', +'J.Ń.', +'J.Ś.', +'J.Ź.', +'J.Ż.', +'K.A.', +'K.B.', +'K.C.', +'K.D.', +'K.E.', +'K.F.', +'K.G.', +'K.H.', +'K.I.', +'K.J.', +'K.K.', +'K.L.', +'K.M.', +'K.N.', +'K.O.', +'K.P.', +'K.R.', +'K.S.', +'K.T.', +'K.U.', +'K.W.', +'K.Y.', +'K.Z.', +'K.Ó.', +'K.Ą.', +'K.Ć.', +'K.Ę.', +'K.Ł.', +'K.Ń.', +'K.Ś.', +'K.Ź.', +'K.Ż.', +'Ks.', +'L.A.', +'L.B.', +'L.C.', +'L.D.', +'L.E.', +'L.F.', +'L.G.', +'L.H.', +'L.I.', +'L.J.', +'L.K.', +'L.L.', +'L.M.', +'L.N.', +'L.O.', +'L.P.', +'L.R.', +'L.S.', +'L.T.', +'L.U.', +'L.W.', +'L.Y.', +'L.Z.', +'L.Ó.', +'L.Ą.', +'L.Ć.', +'L.Ę.', +'L.Ł.', +'L.Ń.', +'L.Ś.', +'L.Ź.', +'L.Ż.', +'Lek.', +'M.A.', +'M.B.', +'M.C.', +'M.D.', +'M.E.', +'M.F.', +'M.G.', +'M.H.', +'M.I.', +'M.J.', +'M.K.', +'M.L.', +'M.M.', +'M.N.', +'M.O.', +'M.P.', +'M.R.', +'M.S.', +'M.T.', +'M.U.', +'M.W.', +'M.Y.', +'M.Z.', +'M.Ó.', +'M.Ą.', +'M.Ć.', +'M.Ę.', +'M.Ł.', 
+'M.Ń.', +'M.Ś.', +'M.Ź.', +'M.Ż.', +'Mat.', +'Mec.', +'Mojż.', +'N.A.', +'N.B.', +'N.C.', +'N.D.', +'N.E.', +'N.F.', +'N.G.', +'N.H.', +'N.I.', +'N.J.', +'N.K.', +'N.L.', +'N.M.', +'N.N.', +'N.O.', +'N.P.', +'N.R.', +'N.S.', +'N.T.', +'N.U.', +'N.W.', +'N.Y.', +'N.Z.', +'N.Ó.', +'N.Ą.', +'N.Ć.', +'N.Ę.', +'N.Ł.', +'N.Ń.', +'N.Ś.', +'N.Ź.', +'N.Ż.', +'Na os.', +'Nadkom.', +'Najśw.', +'Nb.', +'Np.', +'O.A.', +'O.B.', +'O.C.', +'O.D.', +'O.E.', +'O.F.', +'O.G.', +'O.H.', +'O.I.', +'O.J.', +'O.K.', +'O.L.', +'O.M.', +'O.N.', +'O.O.', +'O.P.', +'O.R.', +'O.S.', +'O.T.', +'O.U.', +'O.W.', +'O.Y.', +'O.Z.', +'O.Ó.', +'O.Ą.', +'O.Ć.', +'O.Ę.', +'O.Ł.', +'O.Ń.', +'O.Ś.', +'O.Ź.', +'O.Ż.', +'OO.', +'Oo.', +'P.A.', +'P.B.', +'P.C.', +'P.D.', +'P.E.', +'P.F.', +'P.G.', +'P.H.', +'P.I.', +'P.J.', +'P.K.', +'P.L.', +'P.M.', +'P.N.', +'P.O.', +'P.P.', +'P.R.', +'P.S.', +'P.T.', +'P.U.', +'P.W.', +'P.Y.', +'P.Z.', +'P.Ó.', +'P.Ą.', +'P.Ć.', +'P.Ę.', +'P.Ł.', +'P.Ń.', +'P.Ś.', +'P.Ź.', +'P.Ż.', +'Podkom.', +'Przyp.', +'Ps.', +'Pt.', +'Płk.', +'R.A.', +'R.B.', +'R.C.', +'R.D.', +'R.E.', +'R.F.', +'R.G.', +'R.H.', +'R.I.', +'R.J.', +'R.K.', +'R.L.', +'R.M.', +'R.N.', +'R.O.', +'R.P.', +'R.R.', +'R.S.', +'R.T.', +'R.U.', +'R.W.', +'R.Y.', +'R.Z.', +'R.Ó.', +'R.Ą.', +'R.Ć.', +'R.Ę.', +'R.Ł.', +'R.Ń.', +'R.Ś.', +'R.Ź.', +'R.Ż.', +'Red.', +'Reż.', +'Ryc.', +'Rys.', +'S.A.', +'S.B.', +'S.C.', +'S.D.', +'S.E.', +'S.F.', +'S.G.', +'S.H.', +'S.I.', +'S.J.', +'S.K.', +'S.L.', +'S.M.', +'S.N.', +'S.O.', +'S.P.', +'S.R.', +'S.S.', +'S.T.', +'S.U.', +'S.W.', +'S.Y.', +'S.Z.', +'S.Ó.', +'S.Ą.', +'S.Ć.', +'S.Ę.', +'S.Ł.', +'S.Ń.', +'S.Ś.', +'S.Ź.', +'S.Ż.', +'Sp.', +'Spółdz.', +'Stow.', +'Stoł.', +'Sz.P.', +'Szer.', +'T.A.', +'T.B.', +'T.C.', +'T.D.', +'T.E.', +'T.F.', +'T.G.', +'T.H.', +'T.I.', +'T.J.', +'T.K.', +'T.L.', +'T.M.', +'T.N.', +'T.O.', +'T.P.', +'T.R.', +'T.S.', +'T.T.', +'T.U.', +'T.W.', +'T.Y.', +'T.Z.', +'T.Ó.', +'T.Ą.', +'T.Ć.', +'T.Ę.', +'T.Ł.', +'T.Ń.', +'T.Ś.', +'T.Ź.', 
+'T.Ż.', +'Tow.', +'Tzw.', +'U.A.', +'U.B.', +'U.C.', +'U.D.', +'U.E.', +'U.F.', +'U.G.', +'U.H.', +'U.I.', +'U.J.', +'U.K.', +'U.L.', +'U.M.', +'U.N.', +'U.O.', +'U.P.', +'U.R.', +'U.S.', +'U.T.', +'U.U.', +'U.W.', +'U.Y.', +'U.Z.', +'U.Ó.', +'U.Ą.', +'U.Ć.', +'U.Ę.', +'U.Ł.', +'U.Ń.', +'U.Ś.', +'U.Ź.', +'U.Ż.', +'W.A.', +'W.B.', +'W.C.', +'W.D.', +'W.E.', +'W.F.', +'W.G.', +'W.H.', +'W.I.', +'W.J.', +'W.K.', +'W.L.', +'W.M.', +'W.N.', +'W.O.', +'W.P.', +'W.R.', +'W.S.', +'W.T.', +'W.U.', +'W.W.', +'W.Y.', +'W.Z.', +'W.Ó.', +'W.Ą.', +'W.Ć.', +'W.Ę.', +'W.Ł.', +'W.Ń.', +'W.Ś.', +'W.Ź.', +'W.Ż.', +'Y.A.', +'Y.B.', +'Y.C.', +'Y.D.', +'Y.E.', +'Y.F.', +'Y.G.', +'Y.H.', +'Y.I.', +'Y.J.', +'Y.K.', +'Y.L.', +'Y.M.', +'Y.N.', +'Y.O.', +'Y.P.', +'Y.R.', +'Y.S.', +'Y.T.', +'Y.U.', +'Y.W.', +'Y.Y.', +'Y.Z.', +'Y.Ó.', +'Y.Ą.', +'Y.Ć.', +'Y.Ę.', +'Y.Ł.', +'Y.Ń.', +'Y.Ś.', +'Y.Ź.', +'Y.Ż.', +'Z.A.', +'Z.B.', +'Z.C.', +'Z.D.', +'Z.E.', +'Z.F.', +'Z.G.', +'Z.H.', +'Z.I.', +'Z.J.', +'Z.K.', +'Z.L.', +'Z.M.', +'Z.N.', +'Z.O.', +'Z.P.', +'Z.R.', +'Z.S.', +'Z.T.', +'Z.U.', +'Z.W.', +'Z.Y.', +'Z.Z.', +'Z.Ó.', +'Z.Ą.', +'Z.Ć.', +'Z.Ę.', +'Z.Ł.', +'Z.Ń.', +'Z.Ś.', +'Z.Ź.', +'Z.Ż.', +'Zob.', +'a.', +'ad.', +'adw.', +'afr.', +'ags.', +'akad.', +'al.', +'alb.', +'am.', +'amer.', +'ang.', +'aor.', +'ap.', +'apost.', +'arch.', +'arcyks.', +'art.', +'artyst.', +'asp.', +'astr.', +'aust.', +'austr.', +'austral.', +'b.', +'bałt.', +'bdb.', +'belg.', +'białorus.', +'białost.', +'bm.', +'bot.', +'bp.', +'br.', +'bryg.', +'bryt.', +'bułg.', +'bł.', +'c.b.d.o.', +'c.k.', +'c.o.', +'cbdu.', +'cd.', +'cdn.', +'centr.', +'ces.', +'chem.', +'chir.', +'chiń.', +'chor.', +'chorw.', +'cieśn.', +'cnd.', +'cyg.', +'cyt.', +'cyw.', +'cz.', +'czes.', +'czw.', +'czyt.', +'d.', +'daw.', +'dcn.', +'dekl.', +'demokr.', +'det.', +'dh.', +'diec.', +'dk.', +'dn.', +'doc.', +'doktor h.c.', +'dol.', +'dolnośl.', +'dost.', +'dosł.', +'dot.', +'dr h.c.', +'dr hab.', +'dr.', +'ds.', +'dst.', +'duszp.', +'dypl.', +'dyr.', 
+'dyw.', +'dł.', +'egz.', +'ekol.', +'ekon.', +'elektr.', +'em.', +'ent.', +'est.', +'europ.', +'ew.', +'fab.', +'farm.', +'fot.', +'fr.', +'franc.', +'g.', +'gastr.', +'gat.', +'gd.', +'gen.', +'geogr.', +'geol.', +'gimn.', +'gm.', +'godz.', +'gorz.', +'gosp.', +'gosp.-polit.', +'gr.', +'gram.', +'grub.', +'górn.', +'głęb.', +'h.c.', +'hab.', +'hist.', +'hiszp.', +'hitl.', +'hm.', +'hot.', +'hr.', +'i in.', +'i s.', +'id.', +'ie.', +'im.', +'in.', +'inż.', +'iron.', +'itd.', +'itp.', +'j.', +'j.a.', +'jez.', +'jn.', +'jw.', +'jwt.', +'k.', +'k.k.', +'k.o.', +'k.p.a.', +'k.p.c.', +'k.r.', +'k.r.o.', +'kard.', +'kark.', +'kasz.', +'kat.', +'katol.', +'kier.', +'kk.', +'kl.', +'kol.', +'kpc.', +'kpt.', +'kr.', +'krak.', +'kryt.', +'ks.', +'książk.', +'kuj.', +'kult.', +'kł.', +'l.', +'laic.', +'lek.', +'lit.', +'lp.', +'lub.', +'m.', +'m.b.', +'m.in.', +'m.p.', +'m.st.', +'mar.', +'maz.', +'małop.', +'mec.', +'med.', +'mgr.', +'min.', +'mn.', +'mn.w.', +'muz.', +'mł.', +'n.', +'n.e.', +'n.p.m.', +'n.p.u.', +'na os.', +'nadkom.', +'najśw.', +'nb.', +'niedz.', +'niem.', +'norw.', +'np.', +'nt.', +'nż.', +'o s.', +'o.', +'oO.', +'ob.', +'odc.', +'odp.', +'ok.', +'oo.', +'op.', +'os.', +'p.', +'p.a.', +'p.f.', +'p.f.v.', +'p.n.e.', +'p.o.', +'p.p.', +'p.p.m.', +'p.r.', +'p.r.v.', +'phm.', +'pie.', +'pl.', +'pn.', +'pocz.', +'pod.', +'podgat.', +'podkarp.', +'podkom.', +'poet.', +'poj.', +'pok.', +'pol.', +'pom.', +'pon.', +'poprz.', +'por.', +'port.', +'posp.', +'pow.', +'poz.', +'poł.', +'pp.', +'ppanc.', +'ppor.', +'ppoż.', +'prawdop.', +'proc.', +'prof.', +'prok.', +'przed Chr.', +'przyp.', +'ps.', +'pseud.', +'pt.', +'pw.', +'półn.', +'płd.', +'płk.', +'płn.', +'r.', +'r.ż.', +'red.', +'reż.', +'ros.', +'rozdz.', +'rtg.', +'rtm.', +'rub.', +'rum.', +'ryc.', +'rys.', +'rz.', +'s.', +'serb.', +'sierż.', +'skr.', +'sob.', +'sp.', +'społ.', +'spółdz.', +'spółgł.', +'st.', +'st.rus.', +'stow.', +'stoł.', +'str.', +'sud.', +'szczec.', +'szer.', +'szt.', +'szw.', 
+'szwajc.', +'słow.', +'t.', +'t.j.', +'tatrz.', +'tel.', +'tj.', +'tow.', +'trl.', +'tryb.', +'ts.', +'tur.', +'tys.', +'tzn.', +'tzw.', +'tłum.', +'u s.', +'ub.', +'ukr.', +'ul.', +'up.', +'ur.', +'v.v.', +'vs.', +'w.', +'warm.', +'wlk.', +'wlkp.', +'woj.', +'wroc.', +'ws.', +'wsch.', +'wt.', +'ww.', +'wyb.', +'wyd.', +'wyj.', +'wym.', +'wyst.', +'wył.', +'wyż.', +'wzgl.', +'wędr.', +'węg.', +'wł.', +'x.', +'xx.', +'zach.', +'zagr.', +'zak.', +'zakł.', +'zal.', +'zam.', +'zast.', +'zaw.', +'zazw.', +'zał.', +'zdr.', +'zew.', +'zewn.', +'ziel.', +'zm.', +'zn.', +'zob.', +'zool.', +'zw.', +'ząbk.', +'Ó.A.', +'Ó.B.', +'Ó.C.', +'Ó.D.', +'Ó.E.', +'Ó.F.', +'Ó.G.', +'Ó.H.', +'Ó.I.', +'Ó.J.', +'Ó.K.', +'Ó.L.', +'Ó.M.', +'Ó.N.', +'Ó.O.', +'Ó.P.', +'Ó.R.', +'Ó.S.', +'Ó.T.', +'Ó.U.', +'Ó.W.', +'Ó.Y.', +'Ó.Z.', +'Ó.Ó.', +'Ó.Ą.', +'Ó.Ć.', +'Ó.Ę.', +'Ó.Ł.', +'Ó.Ń.', +'Ó.Ś.', +'Ó.Ź.', +'Ó.Ż.', +'Ą.A.', +'Ą.B.', +'Ą.C.', +'Ą.D.', +'Ą.E.', +'Ą.F.', +'Ą.G.', +'Ą.H.', +'Ą.I.', +'Ą.J.', +'Ą.K.', +'Ą.L.', +'Ą.M.', +'Ą.N.', +'Ą.O.', +'Ą.P.', +'Ą.R.', +'Ą.S.', +'Ą.T.', +'Ą.U.', +'Ą.W.', +'Ą.Y.', +'Ą.Z.', +'Ą.Ó.', +'Ą.Ą.', +'Ą.Ć.', +'Ą.Ę.', +'Ą.Ł.', +'Ą.Ń.', +'Ą.Ś.', +'Ą.Ź.', +'Ą.Ż.', +'Ć.A.', +'Ć.B.', +'Ć.C.', +'Ć.D.', +'Ć.E.', +'Ć.F.', +'Ć.G.', +'Ć.H.', +'Ć.I.', +'Ć.J.', +'Ć.K.', +'Ć.L.', +'Ć.M.', +'Ć.N.', +'Ć.O.', +'Ć.P.', +'Ć.R.', +'Ć.S.', +'Ć.T.', +'Ć.U.', +'Ć.W.', +'Ć.Y.', +'Ć.Z.', +'Ć.Ó.', +'Ć.Ą.', +'Ć.Ć.', +'Ć.Ę.', +'Ć.Ł.', +'Ć.Ń.', +'Ć.Ś.', +'Ć.Ź.', +'Ć.Ż.', +'ćw.', +'ćwicz.', +'Ę.A.', +'Ę.B.', +'Ę.C.', +'Ę.D.', +'Ę.E.', +'Ę.F.', +'Ę.G.', +'Ę.H.', +'Ę.I.', +'Ę.J.', +'Ę.K.', +'Ę.L.', +'Ę.M.', +'Ę.N.', +'Ę.O.', +'Ę.P.', +'Ę.R.', +'Ę.S.', +'Ę.T.', +'Ę.U.', +'Ę.W.', +'Ę.Y.', +'Ę.Z.', +'Ę.Ó.', +'Ę.Ą.', +'Ę.Ć.', +'Ę.Ę.', +'Ę.Ł.', +'Ę.Ń.', +'Ę.Ś.', +'Ę.Ź.', +'Ę.Ż.', +'Ł.A.', +'Ł.B.', +'Ł.C.', +'Ł.D.', +'Ł.E.', +'Ł.F.', +'Ł.G.', +'Ł.H.', +'Ł.I.', +'Ł.J.', +'Ł.K.', +'Ł.L.', +'Ł.M.', +'Ł.N.', +'Ł.O.', +'Ł.P.', +'Ł.R.', +'Ł.S.', +'Ł.T.', +'Ł.U.', +'Ł.W.', +'Ł.Y.', +'Ł.Z.', +'Ł.Ó.', 
+'Ł.Ą.', +'Ł.Ć.', +'Ł.Ę.', +'Ł.Ł.', +'Ł.Ń.', +'Ł.Ś.', +'Ł.Ź.', +'Ł.Ż.', +'Łuk.', +'łac.', +'łot.', +'łow.', +'Ń.A.', +'Ń.B.', +'Ń.C.', +'Ń.D.', +'Ń.E.', +'Ń.F.', +'Ń.G.', +'Ń.H.', +'Ń.I.', +'Ń.J.', +'Ń.K.', +'Ń.L.', +'Ń.M.', +'Ń.N.', +'Ń.O.', +'Ń.P.', +'Ń.R.', +'Ń.S.', +'Ń.T.', +'Ń.U.', +'Ń.W.', +'Ń.Y.', +'Ń.Z.', +'Ń.Ó.', +'Ń.Ą.', +'Ń.Ć.', +'Ń.Ę.', +'Ń.Ł.', +'Ń.Ń.', +'Ń.Ś.', +'Ń.Ź.', +'Ń.Ż.', +'Ś.A.', +'Ś.B.', +'Ś.C.', +'Ś.D.', +'Ś.E.', +'Ś.F.', +'Ś.G.', +'Ś.H.', +'Ś.I.', +'Ś.J.', +'Ś.K.', +'Ś.L.', +'Ś.M.', +'Ś.N.', +'Ś.O.', +'Ś.P.', +'Ś.R.', +'Ś.S.', +'Ś.T.', +'Ś.U.', +'Ś.W.', +'Ś.Y.', +'Ś.Z.', +'Ś.Ó.', +'Ś.Ą.', +'Ś.Ć.', +'Ś.Ę.', +'Ś.Ł.', +'Ś.Ń.', +'Ś.Ś.', +'Ś.Ź.', +'Ś.Ż.', +'ŚW.', +'Śp.', +'Św.', +'śW.', +'śl.', +'śp.', +'śr.', +'św.', +'Ź.A.', +'Ź.B.', +'Ź.C.', +'Ź.D.', +'Ź.E.', +'Ź.F.', +'Ź.G.', +'Ź.H.', +'Ź.I.', +'Ź.J.', +'Ź.K.', +'Ź.L.', +'Ź.M.', +'Ź.N.', +'Ź.O.', +'Ź.P.', +'Ź.R.', +'Ź.S.', +'Ź.T.', +'Ź.U.', +'Ź.W.', +'Ź.Y.', +'Ź.Z.', +'Ź.Ó.', +'Ź.Ą.', +'Ź.Ć.', +'Ź.Ę.', +'Ź.Ł.', +'Ź.Ń.', +'Ź.Ś.', +'Ź.Ź.', +'Ź.Ż.', +'Ż.A.', +'Ż.B.', +'Ż.C.', +'Ż.D.', +'Ż.E.', +'Ż.F.', +'Ż.G.', +'Ż.H.', +'Ż.I.', +'Ż.J.', +'Ż.K.', +'Ż.L.', +'Ż.M.', +'Ż.N.', +'Ż.O.', +'Ż.P.', +'Ż.R.', +'Ż.S.', +'Ż.T.', +'Ż.U.', +'Ż.W.', +'Ż.Y.', +'Ż.Z.', +'Ż.Ó.', +'Ż.Ą.', +'Ż.Ć.', +'Ż.Ę.', +'Ż.Ł.', +'Ż.Ń.', +'Ż.Ś.', +'Ż.Ź.', +'Ż.Ż.', +'ż.', +'żarg.', +'żart.', +'żyd.', +'żyw.'] diff --git a/spacy/lang/pl/lex_attrs.py b/spacy/lang/pl/lex_attrs.py index e85c2ffab..07875f4d7 100644 --- a/spacy/lang/pl/lex_attrs.py +++ b/spacy/lang/pl/lex_attrs.py @@ -4,14 +4,16 @@ from __future__ import unicode_literals from ...attrs import LIKE_NUM -_num_words = ['zero', 'jeden', 'dwa', 'trzy', 'cztery', 'pięć', 'sześć', - 'siedem', 'osiem', 'dziewięć', 'dziesięć', 'jedenaście', +_num_words = ['zero', 'jeden', 'dwa', 'trzy', 'cztery', 'pięć', 'sześć', + 'siedem', 'osiem', 'dziewięć', 'dziesięć', 'jedenaście', 'dwanaście', 'trzynaście', 'czternaście', 'pietnaście', 'szesnaście', 'siedemnaście', 'osiemnaście', - 
'dziewiętnaście', 'dwadzieścia', 'trzydzieści', 'czterdzieści', - 'pięćdziesiąt', 'szcześćdziesiąt', 'siedemdziesiąt', - 'osiemdziesiąt', 'dziewięćdziesiąt', 'sto', 'tysiąc', 'milion', - 'miliard', 'bilion', 'trylion'] + 'dziewiętnaście', 'dwadzieścia', 'trzydzieści', 'czterdzieści', + 'pięćdziesiąt', 'sześćdziesiąt', 'siedemdziesiąt', + 'osiemdziesiąt', 'dziewięćdziesiąt', 'sto', + 'dwieście', 'trzysta', 'czterysta', 'pięćset', 'sześćset', + 'siedemset', 'osiemset', 'dziewięćset', 'tysiąc', 'milion', + 'miliard', 'bilion', 'biliard', 'trylion', 'tryliard', 'kwadrylion'] def like_num(text): diff --git a/spacy/lang/pl/polish_srx_rules_LICENSE.txt b/spacy/lang/pl/polish_srx_rules_LICENSE.txt new file mode 100644 index 000000000..995a1b0f7 --- /dev/null +++ b/spacy/lang/pl/polish_srx_rules_LICENSE.txt @@ -0,0 +1,23 @@ + +Copyright (c) 2019, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py new file mode 100644 index 000000000..8fdcaca41 --- /dev/null +++ b/spacy/lang/pl/punctuation.py @@ -0,0 +1,14 @@ +# coding: utf8 +from __future__ import unicode_literals +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +_quotes = QUOTES.replace("'", '') +_infixes = (LIST_ELLIPSES + LIST_ICONS + + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), + r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes), + r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)]) + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/pl/stop_words.py b/spacy/lang/pl/stop_words.py index bdf2189b6..c1276898a 100644 --- a/spacy/lang/pl/stop_words.py +++ b/spacy/lang/pl/stop_words.py @@ -1,46 +1,80 @@ # encoding: utf8 + from __future__ import unicode_literals - -# Source: http://www.ranks.nl/stopwords/polish +#sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl STOP_WORDS = set(""" -ach aj albo + a aby ach acz aczkolwiek aj albo ale alez +ależ ani az aż -bardzo bez bo być +bardziej bardzo beda bede bedzie bez bo bowiem by +byc byl byla byli bylo byly bym bynajmniej być był +była było były 
będzie będą będę -ci cię ciebie co czy +cala cali caly cała cały chce choć ci cie +ciebie cię co cokolwiek coraz cos coś czasami czasem czemu +czy czyli często -daleko dla dlaczego dlatego do dobrze dokąd dość dużo dwa dwaj dwie dwoje dziś -dzisiaj +daleko dla dlaczego dlatego do dobrze dokad dokąd +dosc dość duzo dużo dwa dwaj dwie dwoje dzis +dzisiaj dziś -gdyby gdzie +gdy gdyby gdyz gdyż gdzie gdziekolwiek gdzies gdzieś go +godz -go +i ich ile im inna inne inny +innych iv ix iz iż -ich ile im inny +ja jak jakas jakaś jakby jaki jakichs jakichś jakie +jakis jakiz jakiś jakiż jakkolwiek jako jakos jakoś je jeden +jedna jednak jednakze jednakże jedno jednym jedynie jego jej jemu +jesli jest jestem jeszcze jezeli jeśli jeżeli juz już ją -ja ją jak jakby jaki je jeden jedna jedno jego jej jemu jeśli jest jestem -jeżeli już +kazdy każdy kiedy kierunku kilka kilku kims kimś kto +ktokolwiek ktora ktore ktorego ktorej ktory ktorych ktorym ktorzy ktos +ktoś która które którego której który których którym którzy ku -każdy kiedy kierunku kto ku +lecz lub -lub +ma mają mam mamy mało mi miał miedzy +mimo między mna mnie mną moga mogą moi moim moj +moja moje moze mozliwe mozna może możliwe można mu musi +my mój -ma mają mam mi mną mnie moi mój moja moje może mu my +na nad nam nami nas nasi nasz nasza nasze +naszego naszych natomiast natychmiast nawet nia nic nich nie niech +niego niej niemu nigdy nim nimi niz nią niż no -na nam nami nas nasi nasz nasza nasze natychmiast nią nic nich nie niego niej -niemu nigdy nim nimi niż +o obok od ok około on ona one +oni ono oraz oto owszem -obok od około on ona one oni ono owszem +pan pana pani po pod podczas pomimo ponad +poniewaz ponieważ powinien powinna powinni powinno poza prawie przeciez +przecież przed przede przedtem przez przy -po pod ponieważ przed przedtem +raz razie roku rowniez również -są sam sama się skąd +sam sama sie się skad skąd soba sobie sobą +sposob sposób swoje są -tak taki tam ten to tobą tobie tu tutaj twoi twój 
twoja twoje ty +ta tak taka taki takich takie takze także tam +te tego tej tel temu ten teraz też to toba +tobie tobą totez toteż totobą trzeba tu tutaj twoi twoim +twoj twoja twoje twym twój ty tych tylko tym tys +tzw tę -wam wami was wasi wasz wasza wasze we więc wszystko wtedy wy +u -żaden zawsze że -""".split()) +vi vii viii + +w wam wami was wasi wasz wasza wasze we +według wie wiele wielu więc więcej wlasnie wszyscy wszystkich wszystkie +wszystkim wszystko wtedy wy właśnie wśród + +xi xii xiii xiv xv + +z za zaden zadna zadne zadnych zapewne zawsze zaś +ze zeby znow znowu znów zostal został + +żaden żadna żadne żadnych że żeby""".split()) diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index aa3f55d22..bf493dc43 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN, ADP - +from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS _exc = {} @@ -32,5 +32,7 @@ for orth in [ "wyj.", "xx.", "ks.", "x.", "wyd.", "wsch.", "o.o."]: _exc[orth] = [{ORTH: orth}] +for orth in PL_BASE_EXCEPTIONS: + _exc[orth] = [{ORTH: orth}] TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index a0c6f4540..2202a1823 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -189,6 +189,9 @@ def uk_tokenizer(): def ca_tokenizer(): return util.get_lang_class('ca').Defaults.create_tokenizer() +@pytest.fixture(scope='session') +def pl_tokenizer(): + return util.get_lang_class('pl').Defaults.create_tokenizer() @pytest.fixture def stringstore(): diff --git a/spacy/tests/lang/pl/test_text.py b/spacy/tests/lang/pl/test_text.py new file mode 100644 index 000000000..5a19f3c99 --- /dev/null +++ b/spacy/tests/lang/pl/test_text.py @@ -0,0 +1,17 @@ +# coding: utf-8 +"""Words like numbers are recognized correctly.""" + + +from __future__ import 
unicode_literals + +import pytest + + +@pytest.mark.parametrize('text,match', [ + ('10', True), ('1', True), ('10,000', True), ('10,00', True), + ('jeden', True), ('dwa', True), ('milion', True), + ('pies', False), (',', False), ('1/2', True)]) +def test_lex_attrs_like_number(pl_tokenizer, text, match): + tokens = pl_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/pl/test_tokenizer.py b/spacy/tests/lang/pl/test_tokenizer.py new file mode 100644 index 000000000..27eb9af1c --- /dev/null +++ b/spacy/tests/lang/pl/test_tokenizer.py @@ -0,0 +1,60 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +DOT_TESTS = [ + ('tel.', ['tel.']), + ('np.', ['np.']), + ('godz. 21:37', ['godz.', '21:37']), + ('inż.', ['inż.']), + ('gosp.-polit.', ['gosp.-polit.']), + ('ppoż', ['ppoż']), + ('płn', ['płn']), + ('ul.', ['ul.']), + ('jw.', ['jw.']), + ('itd.', ['itd.']), + ('cdn.', ['cdn.']), + ('itp.', ['itp.']), + ('10,- zł', ['10,-', 'zł']), + ('0 zł 99 gr', ['0', 'zł', '99', 'gr']), + ('0,99 rub.', ['0,99', 'rub.']), + ('dol.', ['dol.']), + ('1000 m n.p.m.', ['1000', 'm', 'n.p.m.']), + ('m.in.', ['m.in.']), + ('p.n.e.', ['p.n.e.']), + ('Sz.P.', ['Sz.P.']), + ('p.o.', ['p.o.']), + ('k.o.', ['k.o.']), + ('m.st.', ['m.st.']), + ('dra.', ['dra', '.']), + ('pp.', ['pp.']), + ('oo.', ['oo.']) +] + +HYPHEN_TESTS = [ + ('5-fluoropentylo-3-pirydynyloindol', ['5-fluoropentylo-3-pirydynyloindol']), + ('NESS-040C5', ['NESS-040C5']), + ('JTE-7-31', ['JTE-7-31']), + ('BAY-59-3074', ['BAY-59-3074']), + ('BAY-38-7271', ['BAY-38-7271']), + ('STS-135', ['STS-135']), + ('5F-PB-22', ['5F-PB-22']), + ('cztero-', ['cztero-']), + ('jedno-', ['jedno-']), + ('dwu-', ['dwu-']), + ('trzy-', ['trzy-']), + ('b-adoratorzy', ['b-adoratorzy']), + ('2-3-4 drzewa', ['2-3-4', 'drzewa']), + ('b-drzewa', ['b-drzewa']) +] + + +TESTCASES = DOT_TESTS + HYPHEN_TESTS + + +@pytest.mark.parametrize('text,expected_tokens', TESTCASES) +def 
test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens): + tokens = pl_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list