mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Merge branch 'master' into develop
This commit is contained in:
commit
9e652afa4b
106
.github/contributors/DoomCoder.md
vendored
Normal file
106
.github/contributors/DoomCoder.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Piotr Książek |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 22.11.2018 |
|
||||||
|
| GitHub username | DoomCoder |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/Gizzio.md
vendored
Normal file
106
.github/contributors/Gizzio.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [X] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Stanisław Giziński |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 21.11.2018 |
|
||||||
|
| GitHub username | Gizzio |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/MateuszOlko.md
vendored
Normal file
106
.github/contributors/MateuszOlko.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Mateusz Olko |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | 22.11.2018 |
|
||||||
|
| GitHub username | MateuszOlko |
|
||||||
|
| Website (optional) | |
|
106
.github/contributors/kowaalczyk.md
vendored
Normal file
106
.github/contributors/kowaalczyk.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name |Krzysztof Kowalczyk |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date |22.11.2018 |
|
||||||
|
| GitHub username |kowaalczyk |
|
||||||
|
| Website (optional) |kowaalczyk.pl |
|
|
@ -4,6 +4,8 @@ from __future__ import unicode_literals
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
|
@ -14,11 +16,13 @@ from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
class PolishDefaults(Language.Defaults):
|
class PolishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: "pl"
|
lex_attr_getters[LANG] = lambda text: "pl"
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||||
)
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
|
infixes = tuple(TOKENIZER_INFIXES)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
|
||||||
|
|
1441
spacy/lang/pl/_tokenizer_exceptions_list.py
Normal file
1441
spacy/lang/pl/_tokenizer_exceptions_list.py
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -34,11 +34,22 @@ _num_words = [
|
||||||
"osiemdziesiąt",
|
"osiemdziesiąt",
|
||||||
"dziewięćdziesiąt",
|
"dziewięćdziesiąt",
|
||||||
"sto",
|
"sto",
|
||||||
|
"dwieście",
|
||||||
|
"trzysta",
|
||||||
|
"czterysta",
|
||||||
|
"pięćset",
|
||||||
|
"sześćset",
|
||||||
|
"siedemset",
|
||||||
|
"osiemset",
|
||||||
|
"dziewięćset",
|
||||||
"tysiąc",
|
"tysiąc",
|
||||||
"milion",
|
"milion",
|
||||||
"miliard",
|
"miliard",
|
||||||
"bilion",
|
"bilion",
|
||||||
|
"biliard",
|
||||||
"trylion",
|
"trylion",
|
||||||
|
"tryliard",
|
||||||
|
"kwadrylion",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
23
spacy/lang/pl/polish_srx_rules_LICENSE.txt
Normal file
23
spacy/lang/pl/polish_srx_rules_LICENSE.txt
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
|
||||||
|
Copyright (c) 2019, Marcin Miłkowski
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright notice, this
|
||||||
|
list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
14
spacy/lang/pl/punctuation.py
Normal file
14
spacy/lang/pl/punctuation.py
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||||
|
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||||
|
_quotes = QUOTES.replace("'", '')
|
||||||
|
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||||
|
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||||
|
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
|
||||||
|
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||||
|
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||||
|
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
|
||||||
|
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])
|
||||||
|
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
|
@ -1,48 +1,82 @@
|
||||||
# encoding: utf8
|
# encoding: utf8
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
# sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl
|
||||||
# Source: http://www.ranks.nl/stopwords/polish
|
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
ach aj albo
|
a aby ach acz aczkolwiek aj albo ale alez
|
||||||
|
ależ ani az aż
|
||||||
|
|
||||||
bardzo bez bo być
|
bardziej bardzo beda bede bedzie bez bo bowiem by
|
||||||
|
byc byl byla byli bylo byly bym bynajmniej być był
|
||||||
|
była było były będzie będą będę
|
||||||
|
|
||||||
ci cię ciebie co czy
|
cala cali caly cała cały chce choć ci cie
|
||||||
|
ciebie cię co cokolwiek coraz cos coś czasami czasem czemu
|
||||||
|
czy czyli często
|
||||||
|
|
||||||
daleko dla dlaczego dlatego do dobrze dokąd dość dużo dwa dwaj dwie dwoje dziś
|
daleko dla dlaczego dlatego do dobrze dokad dokąd
|
||||||
dzisiaj
|
dosc dość duzo dużo dwa dwaj dwie dwoje dzis
|
||||||
|
dzisiaj dziś
|
||||||
|
|
||||||
gdyby gdzie
|
gdy gdyby gdyz gdyż gdzie gdziekolwiek gdzies gdzieś go
|
||||||
|
godz
|
||||||
|
|
||||||
go
|
i ich ile im inna inne inny
|
||||||
|
innych iv ix iz iż
|
||||||
|
|
||||||
ich ile im inny
|
ja jak jakas jakaś jakby jaki jakichs jakichś jakie
|
||||||
|
jakis jakiz jakiś jakiż jakkolwiek jako jakos jakoś je jeden
|
||||||
|
jedna jednak jednakze jednakże jedno jednym jedynie jego jej jemu
|
||||||
|
jesli jest jestem jeszcze jezeli jeśli jeżeli juz już ją
|
||||||
|
|
||||||
ja ją jak jakby jaki je jeden jedna jedno jego jej jemu jeśli jest jestem
|
kazdy każdy kiedy kierunku kilka kilku kims kimś kto
|
||||||
jeżeli już
|
ktokolwiek ktora ktore ktorego ktorej ktory ktorych ktorym ktorzy ktos
|
||||||
|
ktoś która które którego której który których którym którzy ku
|
||||||
|
|
||||||
każdy kiedy kierunku kto ku
|
lecz lub
|
||||||
|
|
||||||
lub
|
ma mają mam mamy mało mi miał miedzy
|
||||||
|
mimo między mna mnie mną moga mogą moi moim moj
|
||||||
|
moja moje moze mozliwe mozna może możliwe można mu musi
|
||||||
|
my mój
|
||||||
|
|
||||||
ma mają mam mi mną mnie moi mój moja moje może mu my
|
na nad nam nami nas nasi nasz nasza nasze
|
||||||
|
naszego naszych natomiast natychmiast nawet nia nic nich nie niech
|
||||||
|
niego niej niemu nigdy nim nimi niz nią niż no
|
||||||
|
|
||||||
na nam nami nas nasi nasz nasza nasze natychmiast nią nic nich nie niego niej
|
o obok od ok około on ona one
|
||||||
niemu nigdy nim nimi niż
|
oni ono oraz oto owszem
|
||||||
|
|
||||||
obok od około on ona one oni ono owszem
|
pan pana pani po pod podczas pomimo ponad
|
||||||
|
poniewaz ponieważ powinien powinna powinni powinno poza prawie przeciez
|
||||||
|
przecież przed przede przedtem przez przy
|
||||||
|
|
||||||
po pod ponieważ przed przedtem
|
raz razie roku rowniez również
|
||||||
|
|
||||||
są sam sama się skąd
|
sam sama sie się skad skąd soba sobie sobą
|
||||||
|
sposob sposób swoje są
|
||||||
|
|
||||||
tak taki tam ten to tobą tobie tu tutaj twoi twój twoja twoje ty
|
ta tak taka taki takich takie takze także tam
|
||||||
|
te tego tej tel temu ten teraz też to toba
|
||||||
|
tobie tobą totez toteż totobą trzeba tu tutaj twoi twoim
|
||||||
|
twoj twoja twoje twym twój ty tych tylko tym tys
|
||||||
|
tzw tę
|
||||||
|
|
||||||
wam wami was wasi wasz wasza wasze we więc wszystko wtedy wy
|
u
|
||||||
|
|
||||||
żaden zawsze że
|
vi vii viii
|
||||||
""".split()
|
|
||||||
|
w wam wami was wasi wasz wasza wasze we
|
||||||
|
według wie wiele wielu więc więcej wlasnie wszyscy wszystkich wszystkie
|
||||||
|
wszystkim wszystko wtedy wy właśnie wśród
|
||||||
|
|
||||||
|
xi xii xiii xiv xv
|
||||||
|
|
||||||
|
z za zaden zadna zadne zadnych zapewne zawsze zaś
|
||||||
|
ze zeby znow znowu znów zostal został
|
||||||
|
|
||||||
|
żaden żadna żadne żadnych że żeby""".split()
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# encoding: utf8
|
# encoding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
|
from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
@ -19,5 +19,7 @@ for exc_data in [
|
||||||
for orth in ["w.", "r."]:
|
for orth in ["w.", "r."]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
for orth in PL_BASE_EXCEPTIONS:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .morph_rules import MORPH_RULES
|
from .morph_rules import MORPH_RULES
|
||||||
from .lemmatizer import LEMMA_RULES, LOOKUP
|
from .lemmatizer import LEMMA_RULES, LOOKUP
|
||||||
|
@ -22,6 +23,7 @@ class SwedishDefaults(Language.Defaults):
|
||||||
)
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
morph_rules = MORPH_RULES
|
morph_rules = MORPH_RULES
|
||||||
|
tag_map = TAG_MAP
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
169
spacy/lang/sv/tag_map.py
Normal file
169
spacy/lang/sv/tag_map.py
Normal file
|
@ -0,0 +1,169 @@
|
||||||
|
# coding: utf8
|
||||||
|
|
||||||
|
"""
|
||||||
|
Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html
|
||||||
|
for https://github.com/UniversalDependencies/UD_Swedish-Talbanken
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X, VERB
|
||||||
|
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
|
||||||
|
|
||||||
|
TAG_MAP = {
|
||||||
|
'AB': { POS: ADV }, # inte, också, så, bara, nu
|
||||||
|
'AB|AN': { POS: ADV }, # t.ex., ca, t_ex, bl.a., s_k
|
||||||
|
'AB|KOM': { POS: ADV }, # mer, tidigare, mindre, vidare, mera
|
||||||
|
'AB|POS': { POS: ADV }, # mycket, helt, ofta, länge, långt
|
||||||
|
'AB|SMS': { POS: ADV }, # över-, in-
|
||||||
|
'AB|SUV': { POS: ADV }, # minst, mest, högst, främst, helst
|
||||||
|
'DT|MAS|SIN|DEF': { POS: DET },
|
||||||
|
'DT|MAS|SIN|IND': { POS: DET },
|
||||||
|
'DT|NEU|SIN|DEF': { POS: DET }, # det, detta
|
||||||
|
'DT|NEU|SIN|IND': { POS: DET }, # ett, något, inget, vart, vartannat
|
||||||
|
'DT|NEU|SIN|IND/DEF': { POS: DET }, # allt
|
||||||
|
'DT|UTR/NEU|PLU|DEF': { POS: DET }, # de, dessa, bägge, dom
|
||||||
|
'DT|UTR/NEU|PLU|IND': { POS: DET }, # några, inga
|
||||||
|
'DT|UTR/NEU|PLU|IND/DEF': { POS: DET }, # alla
|
||||||
|
'DT|UTR/NEU|SIN/PLU|IND': { POS: DET }, # samma
|
||||||
|
'DT|UTR/NEU|SIN|DEF': { POS: DET }, # vardera
|
||||||
|
'DT|UTR/NEU|SIN|IND': { POS: DET }, # varje, varenda
|
||||||
|
'DT|UTR|SIN|DEF': { POS: DET }, # den, denna
|
||||||
|
'DT|UTR|SIN|IND': { POS: DET }, # en, någon, ingen, var, varannan
|
||||||
|
'DT|UTR|SIN|IND/DEF': { POS: DET }, # all
|
||||||
|
'HA': { POS: ADV }, # när, där, hur, som, då
|
||||||
|
'HD|NEU|SIN|IND': { POS: DET }, # vilket
|
||||||
|
'HD|UTR/NEU|PLU|IND': { POS: DET }, # vilka
|
||||||
|
'HD|UTR|SIN|IND': { POS: DET }, # vilken
|
||||||
|
'HP|-|-|-': { POS: PRON }, # som
|
||||||
|
'HP|NEU|SIN|IND': { POS: PRON }, # vad, vilket
|
||||||
|
'HP|NEU|SIN|IND|SMS': { POS: PRON },
|
||||||
|
'HP|UTR/NEU|PLU|IND': { POS: PRON }, # vilka
|
||||||
|
'HP|UTR|SIN|IND': { POS: PRON }, # vilken, vem
|
||||||
|
'HS|DEF': { POS: DET }, # vars, vilkas, Vems
|
||||||
|
'IE': { POS: PART }, # att
|
||||||
|
'IN': { POS: INTJ }, # Jo, ja, nej, fan, visst
|
||||||
|
'JJ|AN': { POS: ADJ }, # ev, S:t, Kungl, Kungl., Teol
|
||||||
|
'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: ADJ }, # äldres
|
||||||
|
'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # större, högre, mindre, bättre, äldre
|
||||||
|
'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS': { POS: ADJ },
|
||||||
|
'JJ|POS|MAS|SIN|DEF|GEN': { POS: ADJ }, # enskildes, sjukes, andres
|
||||||
|
'JJ|POS|MAS|SIN|DEF|NOM': { POS: ADJ }, # enskilde, sjuke, andre, unge, ene
|
||||||
|
'JJ|POS|NEU|SIN|IND/DEF|NOM': { POS: ADJ }, # eget
|
||||||
|
'JJ|POS|NEU|SIN|IND|GEN': { POS: ADJ },
|
||||||
|
'JJ|POS|NEU|SIN|IND|NOM': { POS: ADJ }, # annat, svårt, möjligt, nytt, sådant
|
||||||
|
'JJ|POS|UTR/NEU|PLU|IND/DEF|GEN': { POS: ADJ }, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas
|
||||||
|
'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM': { POS: ADJ }, # olika, andra, många, stora, vissa
|
||||||
|
'JJ|POS|UTR/NEU|PLU|IND|NOM': { POS: ADJ }, # flera, sådana, fler, få, samtliga
|
||||||
|
'JJ|POS|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ },
|
||||||
|
'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # bra, ena, enda, nästa, ringa
|
||||||
|
'JJ|POS|UTR/NEU|SIN|DEF|GEN': { POS: ADJ },
|
||||||
|
'JJ|POS|UTR/NEU|SIN|DEF|NOM': { POS: ADJ }, # hela, nya, andra, svenska, ekonomiska
|
||||||
|
'JJ|POS|UTR|-|-|SMS': { POS: ADJ }, # fri-, låg-, sexual-
|
||||||
|
'JJ|POS|UTR|SIN|IND/DEF|NOM': { POS: ADJ }, # egen
|
||||||
|
'JJ|POS|UTR|SIN|IND|GEN': { POS: ADJ }, # enskilds
|
||||||
|
'JJ|POS|UTR|SIN|IND|NOM': { POS: ADJ }, # stor, annan, själv, sådan, viss
|
||||||
|
'JJ|SUV|MAS|SIN|DEF|GEN': { POS: ADJ },
|
||||||
|
'JJ|SUV|MAS|SIN|DEF|NOM': { POS: ADJ }, # störste, främste, äldste, minste
|
||||||
|
'JJ|SUV|UTR/NEU|PLU|DEF|NOM': { POS: ADJ }, # flesta
|
||||||
|
'JJ|SUV|UTR/NEU|PLU|IND|NOM': { POS: ADJ },
|
||||||
|
'JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM': { POS: ADJ }, # bästa, största, närmaste, viktigaste, högsta
|
||||||
|
'JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ }, # störst, bäst, tidigast, högst, fattigast
|
||||||
|
'KN': { POS: CCONJ }, # och, eller, som, än, men
|
||||||
|
'KN|AN': { POS: CCONJ },
|
||||||
|
'MAD': { POS: PUNCT }, # ., ?, :, !, ...
|
||||||
|
'MID': { POS: PUNCT }, # ,, -, :, *, ;
|
||||||
|
'NN|-|-|-|-': { POS: NOUN }, # godo, fjol, fullo, somras, måtto
|
||||||
|
'NN|AN': { POS: NOUN }, # kr, %, s., dr, kap.
|
||||||
|
'NN|NEU|-|-|-': { POS: NOUN },
|
||||||
|
'NN|NEU|-|-|SMS': { POS: NOUN }, # yrkes-, barn-, hem-, fack-, vatten-
|
||||||
|
'NN|NEU|PLU|DEF|GEN': { POS: NOUN }, # barnens, årens, u-ländernas, företagens, århundradenas
|
||||||
|
'NN|NEU|PLU|DEF|NOM': { POS: NOUN }, # barnen, u-länderna, åren, länderna, könen
|
||||||
|
'NN|NEU|PLU|IND|GEN': { POS: NOUN }, # slags, års, barns, länders, tusentals
|
||||||
|
'NN|NEU|PLU|IND|NOM': { POS: NOUN }, # barn, år, fall, länder, problem
|
||||||
|
'NN|NEU|SIN|DEF|GEN': { POS: NOUN }, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets
|
||||||
|
'NN|NEU|SIN|DEF|NOM': { POS: NOUN }, # äktenskapet, samhället, barnet, stället, hemmet
|
||||||
|
'NN|NEU|SIN|IND|GEN': { POS: NOUN }, # års, slags, lands, havs, företags
|
||||||
|
'NN|NEU|SIN|IND|NOM': { POS: NOUN }, # år, arbete, barn, sätt, äktenskap
|
||||||
|
'NN|SMS': { POS: NOUN }, # PCB-, Syd-
|
||||||
|
'NN|UTR|-|-|-': { POS: NOUN }, # dags, rätta
|
||||||
|
'NN|UTR|-|-|SMS': { POS: NOUN }, # far-, kibbutz-, röntgen-, barna-, hälso-
|
||||||
|
'NN|UTR|PLU|DEF|GEN': { POS: NOUN }, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas
|
||||||
|
'NN|UTR|PLU|DEF|NOM': { POS: NOUN }, # kvinnorna, föräldrarna, makarna, männen, hyrorna
|
||||||
|
'NN|UTR|PLU|IND|GEN': { POS: NOUN }, # människors, kvinnors, dagars, tiders, månaders
|
||||||
|
'NN|UTR|PLU|IND|NOM': { POS: NOUN }, # procent, människor, kvinnor, miljoner, kronor
|
||||||
|
'NN|UTR|SIN|DEF|GEN': { POS: NOUN }, # kvinnans, världens, familjens, dagens, jordens
|
||||||
|
'NN|UTR|SIN|DEF|NOM': { POS: NOUN }, # familjen, kvinnan, mannen, världen, skolan
|
||||||
|
'NN|UTR|SIN|IND|GEN': { POS: NOUN }, # sorts, medelålders, makes, kvinnas, veckas
|
||||||
|
'NN|UTR|SIN|IND|NOM': { POS: NOUN }, # del, tid, dag, fråga, man
|
||||||
|
'PAD': { POS: PUNCT }, # , ), (
|
||||||
|
'PC|AN': { POS: VERB },
|
||||||
|
'PC|PRF|MAS|SIN|DEF|GEN': { POS: VERB }, # avlidnes
|
||||||
|
'PC|PRF|MAS|SIN|DEF|NOM': { POS: VERB },
|
||||||
|
'PC|PRF|NEU|SIN|IND|NOM': { POS: VERB }, # taget, sett, särskilt, förbjudet, ökat
|
||||||
|
'PC|PRF|UTR/NEU|PLU|IND/DEF|GEN': { POS: VERB }, # försäkrades, anställdas
|
||||||
|
'PC|PRF|UTR/NEU|PLU|IND/DEF|NOM': { POS: VERB }, # särskilda, gifta, ökade, handikappade, skilda
|
||||||
|
'PC|PRF|UTR/NEU|SIN|DEF|GEN': { POS: VERB },
|
||||||
|
'PC|PRF|UTR/NEU|SIN|DEF|NOM': { POS: VERB }, # ökade, gifta, nämnda, nedärvda, dolda
|
||||||
|
'PC|PRF|UTR|SIN|IND|GEN': { POS: VERB },
|
||||||
|
'PC|PRF|UTR|SIN|IND|NOM': { POS: VERB }, # särskild, ökad, beredd, gift, oförändrad
|
||||||
|
'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: VERB }, # studerandes, sammanboendes, dubbelarbetandes
|
||||||
|
'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: VERB }, # följande, beroende, nuvarande, motsvarande, liknande
|
||||||
|
'PL': { POS: PART }, # ut, upp, in, till, med
|
||||||
|
'PL|SMS': { POS: PART },
|
||||||
|
'PM': { POS: PROPN }, # F, N, Liechtenstein, Danmark, DK
|
||||||
|
'PM|GEN': { POS: PROPN }, # Sveriges, EEC:s, Guds, Stockholms, Kristi
|
||||||
|
'PM|NOM': { POS: PROPN }, # Sverige, EEC, Stockholm, USA, ATP
|
||||||
|
'PM|SMS': { POS: PROPN }, # Göteborgs-, Nord-, Väst-
|
||||||
|
'PN|MAS|SIN|DEF|SUB/OBJ': { POS: PRON }, # denne
|
||||||
|
'PN|NEU|SIN|DEF|SUB/OBJ': { POS: PRON }, # det, detta, detsamma
|
||||||
|
'PN|NEU|SIN|IND|SUB/OBJ': { POS: PRON }, # något, allt, mycket, annat, ingenting
|
||||||
|
'PN|UTR/NEU|PLU|DEF|OBJ': { POS: PRON }, # dem, varandra, varann
|
||||||
|
'PN|UTR/NEU|PLU|DEF|SUB': { POS: PRON }, # de, bägge
|
||||||
|
'PN|UTR/NEU|PLU|DEF|SUB/OBJ': { POS: PRON }, # dessa, dom, båda, den, bådadera
|
||||||
|
'PN|UTR/NEU|PLU|IND|SUB/OBJ': { POS: PRON }, # andra, alla, många, sådana, några
|
||||||
|
'PN|UTR/NEU|SIN/PLU|DEF|OBJ': { POS: PRON }, # sig, sej
|
||||||
|
'PN|UTR|PLU|DEF|OBJ': { POS: PRON }, # oss, er, eder
|
||||||
|
'PN|UTR|PLU|DEF|SUB': { POS: PRON }, # vi
|
||||||
|
'PN|UTR|SIN|DEF|OBJ': { POS: PRON }, # dig, mig, henne, honom, Er
|
||||||
|
'PN|UTR|SIN|DEF|SUB': { POS: PRON }, # du, han, hon, jag, ni
|
||||||
|
'PN|UTR|SIN|DEF|SUB/OBJ': { POS: PRON }, # den, denna, densamma
|
||||||
|
'PN|UTR|SIN|IND|SUB': { POS: PRON }, # man
|
||||||
|
'PN|UTR|SIN|IND|SUB/OBJ': { POS: PRON }, # en, var, någon, ingen, Varannan
|
||||||
|
'PP': { POS: ADP }, # i, av, på, för, till
|
||||||
|
'PP|AN': { POS: ADP }, # f
|
||||||
|
'PS|AN': { POS: DET },
|
||||||
|
'PS|NEU|SIN|DEF': { POS: DET }, # sitt, vårt, ditt, mitt, ert
|
||||||
|
'PS|UTR/NEU|PLU|DEF': { POS: DET }, # sina, våra, dina, mina
|
||||||
|
'PS|UTR/NEU|SIN/PLU|DEF': { POS: DET }, # deras, dess, hans, hennes, varandras
|
||||||
|
'PS|UTR|SIN|DEF': { POS: DET }, # sin, vår, din, min, er
|
||||||
|
'RG': { POS: NUM }, # 2, 17, 20, 1, 18
|
||||||
|
'RG|GEN': { POS: NUM },
|
||||||
|
'RG|MAS|SIN|DEF|NOM': { POS: NUM },
|
||||||
|
'RG|NEU|SIN|IND|NOM': { POS: NUM }, # ett
|
||||||
|
'RG|NOM': { POS: NUM }, # två, tre, 1, 20, 2
|
||||||
|
'RG|SMS': { POS: NUM }, # ett-, 1950-, två-, tre-, 1700-
|
||||||
|
'RG|UTR/NEU|SIN|DEF|NOM': { POS: NUM },
|
||||||
|
'RG|UTR|SIN|IND|NOM': { POS: NUM }, # en
|
||||||
|
'RO|MAS|SIN|IND/DEF|GEN': { POS: ADJ },
|
||||||
|
'RO|MAS|SIN|IND/DEF|NOM': { POS: ADJ }, # förste
|
||||||
|
'RO|GEN': { POS: ADJ },
|
||||||
|
'RO|NOM': { POS: ADJ }, # första, andra, tredje, fjärde, femte
|
||||||
|
'SN': { POS: SCONJ }, # att, om, innan, eftersom, medan
|
||||||
|
'UO': { POS: X }, # companionship, vice, versa, family, capita
|
||||||
|
'VB|AN': { POS: VERB }, # jfr
|
||||||
|
'VB|IMP|AKT': { POS: VERB }, # se, Diskutera, låt, Läs, Gå
|
||||||
|
'VB|IMP|SFO': { POS: VERB }, # tas
|
||||||
|
'VB|INF|AKT': { POS: VERB }, # vara, få, ha, bli, kunna
|
||||||
|
'VB|INF|SFO': { POS: VERB }, # användas, finnas, göras, tas, ses
|
||||||
|
'VB|KON|PRS|AKT': { POS: VERB }, # vare, Gånge
|
||||||
|
'VB|KON|PRT|AKT': { POS: VERB }, # vore, finge
|
||||||
|
'VB|KON|PRT|SFO': { POS: VERB },
|
||||||
|
'VB|PRS|AKT': { POS: VERB }, # är, har, kan, får, måste
|
||||||
|
'VB|PRS|SFO': { POS: VERB }, # finns, kallas, behövs, beräknas, används
|
||||||
|
'VB|PRT|AKT': { POS: VERB }, # skulle, var, hade, kunde, fick
|
||||||
|
'VB|PRT|SFO': { POS: VERB }, # fanns, gjordes, höjdes, användes, infördes
|
||||||
|
'VB|SMS': { POS: VERB }, # läs-
|
||||||
|
'VB|SUP|AKT': { POS: VERB }, # varit, fått, blivit, haft, kommit
|
||||||
|
'VB|SUP|SFO': { POS: VERB } # nämnts, gjorts, förändrats, sagts, framhållits
|
||||||
|
}
|
|
@ -128,6 +128,11 @@ def ca_tokenizer():
|
||||||
return get_lang_class("ca").Defaults.create_tokenizer()
|
return get_lang_class("ca").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def pl_tokenizer():
    """Session-scoped fixture returning the default tokenizer for Polish ("pl")."""
    # Use the bare `get_lang_class` like the neighbouring fixtures
    # (ca_tokenizer, tt_tokenizer) do; `util.get_lang_class` relied on a
    # `util` name that the sibling fixtures do not use.
    return get_lang_class("pl").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def tt_tokenizer():
|
def tt_tokenizer():
|
||||||
return get_lang_class("tt").Defaults.create_tokenizer()
|
return get_lang_class("tt").Defaults.create_tokenizer()
|
||||||
|
|
17
spacy/tests/lang/pl/test_text.py
Normal file
17
spacy/tests/lang/pl/test_text.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# coding: utf-8
|
||||||
|
"""Words like numbers are recognized correctly."""
|
||||||
|
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    'text,match',
    [
        ('10', True),
        ('1', True),
        ('10,000', True),
        ('10,00', True),
        ('jeden', True),
        ('dwa', True),
        ('milion', True),
        ('pies', False),
        (',', False),
        ('1/2', True),
    ],
)
def test_lex_attrs_like_number(pl_tokenizer, text, match):
    """Check that `text` stays one token and its `like_num` flag equals `match`."""
    doc = pl_tokenizer(text)
    # Every sample must come out as exactly one token.
    assert len(doc) == 1
    only_token = doc[0]
    assert only_token.like_num == match
|
60
spacy/tests/lang/pl/test_tokenizer.py
Normal file
60
spacy/tests/lang/pl/test_tokenizer.py
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
DOT_TESTS = [
|
||||||
|
('tel.', ['tel.']),
|
||||||
|
('np.', ['np.']),
|
||||||
|
('godz. 21:37', ['godz.', '21:37']),
|
||||||
|
('inż.', ['inż.']),
|
||||||
|
('gosp.-polit.', ['gosp.-polit.']),
|
||||||
|
('ppoż', ['ppoż']),
|
||||||
|
('płn', ['płn']),
|
||||||
|
('ul.', ['ul.']),
|
||||||
|
('jw.', ['jw.']),
|
||||||
|
('itd.', ['itd.']),
|
||||||
|
('cdn.', ['cdn.']),
|
||||||
|
('itp.', ['itp.']),
|
||||||
|
('10,- zł', ['10,-', 'zł']),
|
||||||
|
('0 zł 99 gr', ['0', 'zł', '99', 'gr']),
|
||||||
|
('0,99 rub.', ['0,99', 'rub.']),
|
||||||
|
('dol.', ['dol.']),
|
||||||
|
('1000 m n.p.m.', ['1000', 'm', 'n.p.m.']),
|
||||||
|
('m.in.', ['m.in.']),
|
||||||
|
('p.n.e.', ['p.n.e.']),
|
||||||
|
('Sz.P.', ['Sz.P.']),
|
||||||
|
('p.o.', ['p.o.']),
|
||||||
|
('k.o.', ['k.o.']),
|
||||||
|
('m.st.', ['m.st.']),
|
||||||
|
('dra.', ['dra', '.']),
|
||||||
|
('pp.', ['pp.']),
|
||||||
|
('oo.', ['oo.'])
|
||||||
|
]
|
||||||
|
|
||||||
|
HYPHEN_TESTS = [
|
||||||
|
('5-fluoropentylo-3-pirydynyloindol', ['5-fluoropentylo-3-pirydynyloindol']),
|
||||||
|
('NESS-040C5', ['NESS-040C5']),
|
||||||
|
('JTE-7-31', ['JTE-7-31']),
|
||||||
|
('BAY-59-3074', ['BAY-59-3074']),
|
||||||
|
('BAY-38-7271', ['BAY-38-7271']),
|
||||||
|
('STS-135', ['STS-135']),
|
||||||
|
('5F-PB-22', ['5F-PB-22']),
|
||||||
|
('cztero-', ['cztero-']),
|
||||||
|
('jedno-', ['jedno-']),
|
||||||
|
('dwu-', ['dwu-']),
|
||||||
|
('trzy-', ['trzy-']),
|
||||||
|
('b-adoratorzy', ['b-adoratorzy']),
|
||||||
|
('2-3-4 drzewa', ['2-3-4', 'drzewa']),
|
||||||
|
('b-drzewa', ['b-drzewa'])
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
TESTCASES = DOT_TESTS + HYPHEN_TESTS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
    """Tokenize `text` and compare the non-space token texts with `expected_tokens`."""
    token_list = []
    for tok in pl_tokenizer(text):
        # Whitespace tokens are not part of the expected output.
        if tok.is_space:
            continue
        token_list.append(tok.text)
    assert expected_tokens == token_list
|
Loading…
Reference in New Issue
Block a user