diff --git a/.github/contributors/AMArostegui.md b/.github/contributors/AMArostegui.md new file mode 100644 index 000000000..0778a0035 --- /dev/null +++ b/.github/contributors/AMArostegui.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Antonio Miras | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 11/01/2020 | +| GitHub username | AMArostegui | +| Website (optional) | | diff --git a/.github/contributors/alexcombessie.md b/.github/contributors/alexcombessie.md new file mode 100644 index 000000000..7c5b22a5b --- /dev/null +++ b/.github/contributors/alexcombessie.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alex COMBESSIE | +| Company name (if applicable) | Dataiku | +| Title or role (if applicable) | R&D Engineer | +| Date | 2020-10-27 | +| GitHub username | alexcombessie | +| Website (optional) | | diff --git a/.github/contributors/cristianasp.md b/.github/contributors/cristianasp.md new file mode 100644 index 000000000..a829098e9 --- /dev/null +++ b/.github/contributors/cristianasp.md @@ -0,0 +1,107 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Cristiana S Parada | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-11-04 | +| GitHub username | cristianasp | +| Website (optional) | | + diff --git a/.github/contributors/lorenanda.md b/.github/contributors/lorenanda.md new file mode 100644 index 000000000..90c4c541b --- /dev/null +++ b/.github/contributors/lorenanda.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Lorena Ciutacu | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-12-23 | +| GitHub username | lorenanda | +| Website (optional) | lorenaciutacu.com/ | diff --git a/.github/contributors/ophelielacroix.md b/.github/contributors/ophelielacroix.md new file mode 100644 index 000000000..2abdfdecf --- /dev/null +++ b/.github/contributors/ophelielacroix.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|-------------------------------|-----------------| +| Name | Ophélie Lacroix | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | | +| GitHub username | ophelielacroix | +| Website (optional) | | diff --git a/.github/contributors/yosiasz.md b/.github/contributors/yosiasz.md new file mode 100644 index 000000000..244cc30f5 --- /dev/null +++ b/.github/contributors/yosiasz.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Josiah Solomon |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2020-12-15 |
+| GitHub username | yosiasz |
+| Website (optional) | |
diff --git a/spacy/lang/am/__init__.py b/spacy/lang/am/__init__.py
new file mode 100644
index 000000000..0efd089a4
--- /dev/null
+++ b/spacy/lang/am/__init__.py
@@ -0,0 +1,34 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_SUFFIXES
+
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups
+
+
+class AmharicDefaults(Language.Defaults):  # language defaults for Amharic ("am")
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)  # work on a copy of the base getters
+    lex_attr_getters.update(LEX_ATTRS)  # add the Amharic getters from lex_attrs.py
+    lex_attr_getters[LANG] = lambda text: "am"  # always returns "am", whatever the text
+    lex_attr_getters[NORM] = add_lookups(
+        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
+    )  # NORM getter built from the default NORM getter and BASE_NORMS
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)  # built from the shared base exceptions and the Amharic ones
+    stop_words = STOP_WORDS
+    suffixes = TOKENIZER_SUFFIXES
+    writing_system = {"direction": "ltr", "has_case": False, "has_letters": True}  # declared left-to-right, without letter case
+
+
+class Amharic(Language):  # Language subclass registered under the code "am"
+    lang = "am"
+    Defaults = AmharicDefaults
+
+
+__all__ = ["Amharic"]
diff --git a/spacy/lang/am/examples.py b/spacy/lang/am/examples.py
new file mode 100644
index 000000000..939501505
--- /dev/null
+++ b/spacy/lang/am/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+ +>>> from spacy.lang.am.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "አፕል የዩኬን ጅምር ድርጅት በ 1 ቢሊዮን ዶላር ለመግዛት አስቧል።", + "የራስ ገዝ መኪኖች የኢንሹራንስ ኃላፊነትን ወደ አምራቾች ያዛውራሉ", + "ሳን ፍራንሲስኮ የእግረኛ መንገድ አቅርቦት ሮቦቶችን ማገድን ይመለከታል", + "ለንደን በእንግሊዝ የምትገኝ ትልቅ ከተማ ናት።", + "የት ነህ?", + "የፈረንሳይ ፕሬዝዳንት ማናቸው?", + "የአሜሪካ ዋና ከተማ ምንድነው?", + "ባራክ ኦባማ መቼ ተወለደ?", +] diff --git a/spacy/lang/am/lex_attrs.py b/spacy/lang/am/lex_attrs.py new file mode 100644 index 000000000..389444fcf --- /dev/null +++ b/spacy/lang/am/lex_attrs.py @@ -0,0 +1,104 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +_num_words = [ + "ዜሮ", + "አንድ", + "ሁለት", + "ሶስት", + "አራት", + "አምስት", + "ስድስት", + "ሰባት", + "ስምንት", + "ዘጠኝ", + "አስር", + "አስራ አንድ", + "አስራ ሁለት", + "አስራ ሶስት", + "አስራ አራት", + "አስራ አምስት", + "አስራ ስድስት", + "አስራ ሰባት", + "አስራ ስምንት", + "አስራ ዘጠኝ", + "ሃያ", + "ሰላሳ", + "አርባ", + "ሃምሳ", + "ስልሳ", + "ሰባ", + "ሰማንያ", + "ዘጠና", + "መቶ", + "ሺህ", + "ሚሊዮን", + "ቢሊዮን", + "ትሪሊዮን", + "ኳድሪሊዮን", + "ገጅሊዮን", + "ባዝሊዮን" +] + +_ordinal_words = [ + "አንደኛ", + "ሁለተኛ", + "ሶስተኛ", + "አራተኛ", + "አምስተኛ", + "ስድስተኛ", + "ሰባተኛ", + "ስምንተኛ", + "ዘጠነኛ", + "አስረኛ", + "አስራ አንደኛ", + "አስራ ሁለተኛ", + "አስራ ሶስተኛ", + "አስራ አራተኛ", + "አስራ አምስተኛ", + "አስራ ስድስተኛ", + "አስራ ሰባተኛ", + "አስራ ስምንተኛ", + "አስራ ዘጠነኛ", + "ሃያኛ", + "ሰላሳኛ", + "አርባኛ", + "አምሳኛ", + "ስድሳኛ", + "ሰባኛ", + "ሰማንያኛ", + "ዘጠናኛ", + "መቶኛ", + "ሺኛ", + "ሚሊዮንኛ", + "ቢሊዮንኛ", + "ትሪሊዮንኛ" +] +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + + text_lower = text.lower() + if text_lower in _num_words: + return True + + # Check ordinal number + if text_lower in _ordinal_words: + return True + if text_lower.endswith("ኛ"): + if text_lower[:-2].isdigit(): + return True + + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git
a/spacy/lang/am/punctuation.py b/spacy/lang/am/punctuation.py new file mode 100644 index 000000000..f4da91122 --- /dev/null +++ b/spacy/lang/am/punctuation.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY +from ..char_classes import UNITS, ALPHA_UPPER + +_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split() + +_suffixes = ( + _list_punct + + LIST_ELLIPSES + + LIST_QUOTES + + [ + r"(?<=[0-9])\+", + # Amharic is written from Left-To-Right + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) + +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/am/stop_words.py b/spacy/lang/am/stop_words.py new file mode 100644 index 000000000..66f5d8834 --- /dev/null +++ b/spacy/lang/am/stop_words.py @@ -0,0 +1,10 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Stop words +STOP_WORDS = set( + """ +ግን አንቺ አንተ እናንተ ያንተ ያንቺ የናንተ ራስህን ራስሽን ራሳችሁን +""".split() +) \ No newline at end of file diff --git a/spacy/lang/am/tokenizer_exceptions.py b/spacy/lang/am/tokenizer_exceptions.py new file mode 100644 index 000000000..4c582c268 --- /dev/null +++ b/spacy/lang/am/tokenizer_exceptions.py @@ -0,0 +1,25 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA + + +_exc = {} + + +for exc_data in [ + {ORTH: "ት/ቤት", LEMMA: "ትምህርት ቤት"}, + {ORTH: "ወ/ሮ", LEMMA: PRON_LEMMA, NORM: "ወይዘሮ"}, + +]: + _exc[exc_data[ORTH]] = [exc_data] + + +for orth in [ + "ዓ.ም.", + "ኪ.ሜ.", +]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 8b8cf2236..3fb0fb41e 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -5,6 +5,8 @@ split_chars = lambda char: list(char.strip().split(" ")) merge_chars = lambda char: char.strip().replace(" ", "|") 
group_chars = lambda char: char.strip().replace(" ", "") +_ethiopic = r"\u1200-\u137F" + _bengali = r"\u0980-\u09FF" _hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F" @@ -221,7 +223,8 @@ _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower + _macedonian_lower _uncased = ( - _bengali + _ethiopic + + _bengali + _hebrew + _persian + _sinhala diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 0190656e5..88258a8df 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -7,6 +7,7 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .morph_rules import MORPH_RULES from ..tag_map import TAG_MAP +from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language @@ -24,6 +25,7 @@ class DanishDefaults(Language.Defaults): suffixes = TOKENIZER_SUFFIXES tag_map = TAG_MAP stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Danish(Language): diff --git a/spacy/lang/da/syntax_iterators.py b/spacy/lang/da/syntax_iterators.py new file mode 100644 index 000000000..c6b944193 --- /dev/null +++ b/spacy/lang/da/syntax_iterators.py @@ -0,0 +1,81 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import NOUN, PROPN, PRON, VERB, AUX +from ...errors import Errors + + +def noun_chunks(doclike): + def is_verb_token(tok): + return tok.pos in [VERB, AUX] + + def next_token(tok): + try: + return tok.nbor() + except IndexError: + return None + + def get_left_bound(doc, root): + left_bound = root + for tok in reversed(list(root.lefts)): + if tok.dep in np_left_deps: + left_bound = tok + return left_bound + + def get_right_bound(doc, root): + right_bound = root + for tok in root.rights: + if tok.dep in np_right_deps: + right = get_right_bound(doc, tok) + if list( + filter( + lambda t: is_verb_token(t) or t.dep in stop_deps, + 
doc[root.i : right.i], + ) + ): + break + else: + right_bound = right + return right_bound + + def get_bounds(doc, root): + return get_left_bound(doc, root), get_right_bound(doc, root) + + doc = doclike.doc + + if not doc.is_parsed: + raise ValueError(Errors.E029) + + if not len(doc): + return + + left_labels = [ + "det", + "fixed", + "nmod:poss", + "amod", + "flat", + "goeswith", + "nummod", + "appos", + ] + right_labels = ["fixed", "nmod:poss", "amod", "flat", "goeswith", "nummod", "appos"] + stop_labels = ["punct"] + + np_label = doc.vocab.strings.add("NP") + np_left_deps = [doc.vocab.strings.add(label) for label in left_labels] + np_right_deps = [doc.vocab.strings.add(label) for label in right_labels] + stop_deps = [doc.vocab.strings.add(label) for label in stop_labels] + + chunks = [] + prev_right = -1 + for token in doclike: + if token.pos in [PROPN, NOUN, PRON]: + left, right = get_bounds(doc, token) + if left.i <= prev_right: + continue + yield left.i, right.i + 1, np_label + prev_right = right.i + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 964a714ae..dbe1b2a51 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -319,7 +319,6 @@ for exc_data in [ # Other contractions with leading apostrophe for exc_data in [ - {ORTH: "cause", NORM: "because"}, {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, {ORTH: "ll", LEMMA: "will", NORM: "will"}, {ORTH: "nuff", LEMMA: "enough", NORM: "enough"}, diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index ae8432043..2027f0b32 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -4,87 +4,81 @@ from __future__ import unicode_literals STOP_WORDS = set( """ -a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons -allô alors anterieur anterieure anterieures apres après as assez attendu au +a à â abord afin ah ai aie 
ainsi ait allaient allons +alors anterieur anterieure anterieures apres après as assez attendu au aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront -aussi autre autrefois autrement autres autrui aux auxquelles auxquels avaient +aussi autre autrement autres autrui aux auxquelles auxquels avaient avais avait avant avec avoir avons ayant -bah bas basee bat beau beaucoup bien bigre boum bravo brrr +bas basee bat c' c’ ça car ce ceci cela celle celle-ci celle-là celles celles-ci celles-là celui celui-ci celui-là cent cependant certain certaine certaines certains certes ces -cet cette ceux ceux-ci ceux-là chacun chacune chaque cher chers chez chiche -chut chère chères ci cinq cinquantaine cinquante cinquantième cinquième clac -clic combien comme comment comparable comparables compris concernant contre -couic crac +cet cette ceux ceux-ci ceux-là chacun chacune chaque chez ci cinq cinquantaine cinquante +cinquantième cinquième combien comme comment compris concernant -d' d’ da dans de debout dedans dehors deja delà depuis dernier derniere derriere +d' d’ da dans de debout dedans dehors deja delà depuis derriere derrière des desormais desquelles desquels dessous dessus deux deuxième deuxièmement devant devers devra different differentes differents différent différente différentes différents dire directe directement dit dite dits divers diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont -douze douzième dring du duquel durant dès désormais +douze douzième du duquel durant dès désormais effet egale egalement egales eh elle elle-même elles elles-mêmes en encore enfin entre envers environ es ès est et etaient étaient etais étais etait était -etant étant etc été etre être eu euh eux eux-mêmes exactement excepté extenso -exterieur +etant étant etc été etre être eu eux eux-mêmes exactement excepté -fais faisaient faisant fait façon feront fi flac floc font +fais faisaient faisant fait façon feront font gens -ha hein hem hep hi ho 
holà hop hormis hors hou houp hue hui huit huitième hum -hurrah hé hélas i il ils importe +ha hem hep hi ho hormis hors hou houp hue hui huit huitième +hé i il ils importe j' j’ je jusqu jusque juste -l' l’ la laisser laquelle las le lequel les lesquelles lesquels leur leurs longtemps +l' l’ la laisser laquelle le lequel les lesquelles lesquels leur leurs longtemps lors lorsque lui lui-meme lui-même là lès -m' m’ ma maint maintenant mais malgre malgré maximale me meme memes merci mes mien -mienne miennes miens mille mince minimale moi moi-meme moi-même moindres moins -mon moyennant même mêmes +m' m’ ma maint maintenant mais malgre me meme memes merci mes mien +mienne miennes miens mille moi moi-meme moi-même moindres moins +mon même mêmes -n' n’ na naturel naturelle naturelles ne neanmoins necessaire necessairement neuf -neuvième ni nombreuses nombreux non nos notamment notre nous nous-mêmes nouveau -nul néanmoins nôtre nôtres +n' n’ na ne neanmoins neuvième ni nombreuses nombreux nos notamment +notre nous nous-mêmes nouveau nul néanmoins nôtre nôtres -o ô oh ohé ollé olé on ont onze onzième ore ou ouf ouias oust ouste outre +o ô on ont onze onzième ore ou ouias oust outre ouvert ouverte ouverts où -paf pan par parce parfois parle parlent parler parmi parseme partant -particulier particulière particulièrement pas passé pendant pense permet -personne peu peut peuvent peux pff pfft pfut pif pire plein plouf plus -plusieurs plutôt possessif possessifs possible possibles pouah pour pourquoi +par parce parfois parle parlent parler parmi parseme partant +pas pendant pense permet personne peu peut peuvent peux plus +plusieurs plutôt possible possibles pour pourquoi pourrais pourrait pouvait prealable precisement premier première premièrement -pres probable probante procedant proche près psitt pu puis puisque pur pure +pres procedant proche près pu puis puisque qu' qu’ quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt quatrième quatrièmement que quel
quelconque quelle quelles quelqu'un quelque quelques quels qui quiconque quinze quoi quoique -rare rarement rares relative relativement remarquable rend rendre restant reste -restent restrictif retour revoici revoilà rien +relative relativement rend rendre restant reste +restent retour revoici revoilà -s' s’ sa sacrebleu sait sans sapristi sauf se sein seize selon semblable semblaient +s' s’ sa sait sans sauf se seize selon semblable semblaient semble semblent sent sept septième sera seraient serait seront ses seul seule seulement si sien sienne siennes siens sinon six sixième soi soi-même soit -soixante son sont sous souvent specifique specifiques speculatif stop -strictement subtiles suffisant suffisante suffit suis suit suivant suivante -suivantes suivants suivre superpose sur surtout +soixante son sont sous souvent specifique specifiques stop +suffisant suffisante suffit suis suit suivant suivante +suivantes suivants suivre sur surtout -t' t’ ta tac tant tardive te tel telle tellement telles tels tenant tend tenir tente -tes tic tien tienne tiennes tiens toc toi toi-même ton touchant toujours tous -tout toute toutefois toutes treize trente tres trois troisième troisièmement -trop très tsoin tsouin tu té +t' t’ ta tant te tel telle tellement telles tels tenant tend tenir tente +tes tien tienne tiennes tiens toi toi-même ton touchant toujours tous +tout toute toutes treize trente tres trois troisième troisièmement +tu té -un une unes uniformement unique uniques uns +un une unes uns -va vais vas vers via vif vifs vingt vivat vive vives vlan voici voilà vont vos +va vais vas vers via vingt voici voilà vont vos votre vous vous-mêmes vu vé vôtre vôtres -zut """.split() ) diff --git a/spacy/lang/pt/stop_words.py b/spacy/lang/pt/stop_words.py index 774b06809..c64ec2fe9 100644 --- a/spacy/lang/pt/stop_words.py +++ b/spacy/lang/pt/stop_words.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals STOP_WORDS = set( """ -à às área acerca ademais adeus agora ainda algo 
algumas alguns ali além ambas ambos antes +a à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes ao aos apenas apoia apoio apontar após aquela aquelas aquele aqueles aqui aquilo as assim através atrás até aí @@ -18,7 +18,7 @@ da daquela daquele dar das de debaixo demais dentro depois des desde dessa desse desta deste deve devem deverá dez dezanove dezasseis dezassete dezoito diante direita disso diz dizem dizer do dois dos doze duas dá dão -é és ela elas ele eles em embora enquanto entre então era essa essas esse esses esta +e é és ela elas ele eles em embora enquanto entre então era essa essas esse esses esta estado estar estará estas estava este estes esteve estive estivemos estiveram estiveste estivestes estou está estás estão eu eventual exemplo @@ -40,7 +40,7 @@ na nada naquela naquele nas nem nenhuma nessa nesse nesta neste no nos nossa nossas nosso nossos nova novas nove novo novos num numa nunca nuns não nível nós número números -obrigada obrigado oitava oitavo oito onde ontem onze ora os ou outra outras outros +o obrigada obrigado oitava oitavo oito onde ontem onze ora os ou outra outras outros para parece parte partir pegar pela pelas pelo pelos perto pode podem poder poderá podia pois ponto pontos por porquanto porque porquê portanto porém posição @@ -63,8 +63,8 @@ tudo tão têm um uma umas uns usa usar último vai vais valor veja vem vens ver vez vezes vinda vindo vinte você vocês vos vossa -vossas vosso vossos vários vão vêm vós +vossas vosso vossos vários vão vêm vós zero """.split() -) +) diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py index b5ba73458..2f6af4695 100644 --- a/spacy/lang/ro/stop_words.py +++ b/spacy/lang/ro/stop_words.py @@ -12,11 +12,13 @@ aceasta această aceea aceeasi +aceeași acei aceia acel acela acelasi +același acele acelea acest @@ -28,12 +30,11 @@ acestia acestui aceşti aceştia -acești -aceștia acolo acord acum adica +adică ai aia aibă @@ -57,6 +58,8 @@ alături am anume 
apoi +apai +apăi ar are as @@ -154,7 +157,9 @@ că căci cărei căror +cărora cărui +căruia către d da @@ -179,6 +184,8 @@ deşi deși din dinaintea +dincolo +dincoace dintr dintr- dintre @@ -190,6 +197,10 @@ drept dupa după dă +deunaseara +deunăseară +deunazi +deunăzi e ea ei @@ -224,7 +235,6 @@ geaba graţie grație h -halbă i ia iar @@ -236,6 +246,7 @@ in inainte inapoi inca +incotro incit insa intr @@ -256,6 +267,10 @@ m ma mai mare +macar +măcar +mata +matale mea mei mele @@ -278,11 +293,18 @@ mâine mîine mă n +na ne +neincetat +neîncetat nevoie ni nici +nicidecum +nicidecat +nicidecât niciodata +niciodată nicăieri nimeni nimeri @@ -304,6 +326,10 @@ noștri nu numai o +odata +odată +odinioara +odinioară opt or ori @@ -318,7 +344,9 @@ oricît oriunde p pai +păi parca +parcă patra patru patrulea @@ -335,13 +363,11 @@ prima primul prin printr- +printre putini puţin puţina puţină -puțin -puțina -puțină până pînă r @@ -419,6 +445,7 @@ unuia unul v va +vai vi voastre voastră diff --git a/spacy/lang/ti/__init__.py b/spacy/lang/ti/__init__.py new file mode 100644 index 000000000..05c7ea847 --- /dev/null +++ b/spacy/lang/ti/__init__.py @@ -0,0 +1,34 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_SUFFIXES + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class TigrinyaDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: "ti" + lex_attr_getters[NORM] = add_lookups( + Language.Defaults.lex_attr_getters[NORM], BASE_NORMS + ) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = STOP_WORDS + suffixes 
= TOKENIZER_SUFFIXES + writing_system = {"direction": "ltr", "has_case": False, "has_letters": True} + + +class Tigrinya(Language): + lang = "ti" + Defaults = TigrinyaDefaults + + +__all__ = ["Tigrinya"] diff --git a/spacy/lang/ti/examples.py b/spacy/lang/ti/examples.py new file mode 100644 index 000000000..edae9fd27 --- /dev/null +++ b/spacy/lang/ti/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ti.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "አፕል ብዩኬ ትርከብ ንግድ ብ1 ቢሊዮን ዶላር ንምግዛዕ ሐሲባ።", + "ፈላማይ ክታበት ኮቪድ 19 ተጀሚሩ፤ሓዱሽ ተስፋ ሂቡ ኣሎ", + "ቻንስለር ጀርመን ኣንገላ መርከል ዝርግሓ ቫይረስ ኮሮና ንምክልካል ጽኑዕ እገዳ ክግበር ጸዊዓ", + "ለንደን ብዓዲ እንግሊዝ ትርከብ ዓባይ ከተማ እያ።", + "ናበይ አለኻ፧", + "ናይ ፈረንሳይ ፕሬዝዳንት መን እዩ፧", + "ናይ አሜሪካ ዋና ከተማ እንታይ እያ፧", + "ኦባማ መዓስ ተወሊዱ፧", +] diff --git a/spacy/lang/ti/lex_attrs.py b/spacy/lang/ti/lex_attrs.py new file mode 100644 index 000000000..989eb3e91 --- /dev/null +++ b/spacy/lang/ti/lex_attrs.py @@ -0,0 +1,104 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +_num_words = [ + "ዜሮ", + "ሐደ", + "ክልተ", + "ሰለስተ", + "ኣርባዕተ", + "ሓሙሽተ", + "ሽድሽተ", + "ሸውዓተ", + "ሽሞንተ", + "ትሽዓተ", + "ኣሰርተ", + "ኣሰርተ ሐደ", + "ኣሰርተ ክልተ", + "ኣሰርተ ሰለስተ", + "ኣሰርተ ኣርባዕተ", + "ኣሰርተ ሓሙሽተ", + "ኣሰርተ ሽድሽተ", + "ኣሰርተ ሸውዓተ", + "ኣሰርተ ሽሞንተ", + "ኣሰርተ ትሽዓተ", + "ዕስራ", + "ሰላሳ", + "ኣርብዓ", + "ሃምሳ", + "ስልሳ", + "ሰብዓ", + "ሰማንያ", + "ተስዓ", + "ሚእቲ", + "ሺሕ", + "ሚልዮን", + "ቢልዮን", + "ትሪልዮን", + "ኳድሪልዮን", + "ገጅልዮን", + "ባዝልዮን" +] + +_ordinal_words = [ + "ቀዳማይ", + "ካልኣይ", + "ሳልሳይ", + "ራብኣይ", + "ሓምሻይ", + "ሻድሻይ", + "ሻውዓይ", + "ሻምናይ", + "ዘጠነኛ", + "አስረኛ", + "ኣሰርተ አንደኛ", + "ኣሰርተ ሁለተኛ", + "ኣሰርተ ሶስተኛ", + "ኣሰርተ አራተኛ", + "ኣሰርተ አምስተኛ", + "ኣሰርተ ስድስተኛ", + "ኣሰርተ ሰባተኛ", + "ኣሰርተ ስምንተኛ", + "ኣሰርተ ዘጠነኛ", + "ሃያኛ", + "ሰላሳኛ", + "አርባኛ", + "አምሳኛ", + "ስድሳኛ", + "ሰባኛ", + "ሰማንያኛ", + "ዘጠናኛ", + "መቶኛ", + "ሺኛ", + "ሚሊዮንኛ", + "ቢሊዮንኛ", + "ትሪሊዮንኛ" +] +def like_num(text): + if
text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + + text_lower = text.lower() + if text_lower in _num_words: + return True + + # Check ordinal number + if text_lower in _ordinal_words: + return True + if text_lower.endswith("ኛ"): + if text_lower[:-2].isdigit(): + return True + + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/ti/punctuation.py b/spacy/lang/ti/punctuation.py new file mode 100644 index 000000000..44b699612 --- /dev/null +++ b/spacy/lang/ti/punctuation.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY +from ..char_classes import UNITS, ALPHA_UPPER + +_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split() + +_suffixes = ( + _list_punct + + LIST_ELLIPSES + + LIST_QUOTES + + [ + r"(?<=[0-9])\+", + # Tigrinya is written from Left-To-Right + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) + +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/ti/stop_words.py b/spacy/lang/ti/stop_words.py new file mode 100644 index 000000000..a73fa6936 --- /dev/null +++ b/spacy/lang/ti/stop_words.py @@ -0,0 +1,10 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Stop words +STOP_WORDS = set( + """ +ግን ግና ንስኻ ንስኺ ንስኻትክን ንስኻትኩም ናትካ ናትኪ ናትክን ናትኩም +""".split() +) \ No newline at end of file diff --git a/spacy/lang/ti/tokenizer_exceptions.py b/spacy/lang/ti/tokenizer_exceptions.py new file mode 100644 index 000000000..3cb050ae8 --- /dev/null +++ b/spacy/lang/ti/tokenizer_exceptions.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA + + +_exc = {} + + +for 
exc_data in [ + {ORTH: "ት/ቤት", LEMMA: "ትምህርት ቤት"}, + {ORTH: "ወ/ሮ", LEMMA: PRON_LEMMA, NORM: "ወይዘሮ"}, + {ORTH: "ወ/ሪ", LEMMA: PRON_LEMMA, NORM: "ወይዘሪት"}, + +]: + _exc[exc_data[ORTH]] = [exc_data] + + +for orth in [ + "ዓ.ም.", + "ኪ.ሜ.", +]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 83b0897c8..90a18925b 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -31,6 +31,9 @@ def pytest_runtest_setup(item): def tokenizer(): return get_lang_class("xx").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def am_tokenizer(): + return get_lang_class("am").Defaults.create_tokenizer() @pytest.fixture(scope="session") def ar_tokenizer(): @@ -242,6 +245,9 @@ def th_tokenizer(): pytest.importorskip("pythainlp") return get_lang_class("th").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ti_tokenizer(): + return get_lang_class("ti").Defaults.create_tokenizer() @pytest.fixture(scope="session") def tr_tokenizer(): diff --git a/spacy/tests/lang/am/__init__.py b/spacy/tests/lang/am/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/am/test_exception.py b/spacy/tests/lang/am/test_exception.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/am/test_text.py b/spacy/tests/lang/am/test_text.py new file mode 100644 index 000000000..be3c44ebf --- /dev/null +++ b/spacy/tests/lang/am/test_text.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.am.lex_attrs import like_num + + +def test_am_tokenizer_handles_long_text(am_tokenizer): + text = """ሆሴ ሙጂካ በበጋ ወቅት በኦክስፎርድ ንግግር አንድያቀርቡ ሲጋበዙ ጭንቅላታቸው "ፈነዳ"። + +“እጅግ ጥንታዊ” የእንግሊዝኛ ተናጋሪ ዩኒቨርስቲ፣ በአስር ሺዎች የሚቆጠሩ ዩሮዎችን ለተማሪዎች በማስተማር የሚያስከፍለው + +እና ከማርጋሬት ታቸር እስከ ስቲቨን ሆኪንግ በአዳራሾቻቸው ውስጥ ንግግር ያደረጉበት የትምህርት ማዕከል፣ በሞንቴቪዴኦ + +በሚገኘው የመንግስት ትምህርት ቤት የሰለጠኑትን የ81 ዓመቱ አዛውንት አገልግሎት ጠየቁ።""" + tokens = 
am_tokenizer(text) + + assert len(tokens) == 56 + + +@pytest.mark.parametrize( + "text,length", + [ + ("ሆሴ ሙጂካ ለምን ተመረጠ?", 5), + ("“በፍፁም?”", 4), + ("""አዎ! ሆዜ አርካዲዮ ቡንዲያ “እንሂድ” ሲል መለሰ።""", 11), + ("እነሱ በግምት 10ኪ.ሜ. ሮጡ።", 7), + ("እና ከዚያ ለምን...", 4), + ], +) +def test_am_tokenizer_handles_cnts(am_tokenizer, text, length): + tokens = am_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10.000", True), + ("1000", True), + ("999,0", True), + ("አንድ", True), + ("ሁለት", True), + ("ትሪሊዮን", True), + ("ውሻ", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(am_tokenizer, text, match): + tokens = am_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match \ No newline at end of file diff --git a/spacy/tests/lang/da/test_noun_chunks.py b/spacy/tests/lang/da/test_noun_chunks.py new file mode 100644 index 000000000..01bbbb21c --- /dev/null +++ b/spacy/tests/lang/da/test_noun_chunks.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...util import get_doc + + +def test_noun_chunks_is_parsed(da_tokenizer): + """Test that noun_chunks raises Value Error for 'da' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. 
+ """ + doc = da_tokenizer("Det er en sætning") + doc.is_parsed = False + with pytest.raises(ValueError): + list(doc.noun_chunks) + + + +DA_NP_TEST_EXAMPLES = [ + ( + "Hun elsker at plukker frugt.", + ['PRON', 'VERB', 'PART', 'VERB', 'NOUN', 'PUNCT'], + ['nsubj', 'ROOT', 'mark', 'obj', 'obj', 'punct'], + [1, 0, 1, -2, -1, -4], + ["Hun", "frugt"], + ), + ( + "Påfugle er de smukkeste fugle.", + ['NOUN', 'AUX', 'DET', 'ADJ', 'NOUN', 'PUNCT'], + ['nsubj', 'cop', 'det', 'amod', 'ROOT', 'punct'], + [4, 3, 2, 1, 0, -1], + ["Påfugle", "de smukkeste fugle"], + ), + ( + "Rikke og Jacob Jensen glæder sig til en hyggelig skovtur", + ['PROPN', 'CCONJ', 'PROPN', 'PROPN', 'VERB', 'PRON', 'ADP', 'DET', 'ADJ', 'NOUN'], + ['nsubj', 'cc', 'conj', 'flat', 'ROOT', 'obj', 'case', 'det', 'amod', 'obl'], + [4, 1, -2, -1, 0, -1, 3, 2, 1, -5], + ["Rikke", "Jacob Jensen", "sig", "en hyggelig skovtur"], + ), +] + + +@pytest.mark.parametrize( + "text,pos,deps,heads,expected_noun_chunks", DA_NP_TEST_EXAMPLES +) +def test_da_noun_chunks(da_tokenizer, text, pos, deps, heads, expected_noun_chunks): + tokens = da_tokenizer(text) + + assert len(heads) == len(pos) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, pos=pos + ) + + noun_chunks = list(doc.noun_chunks) + assert len(noun_chunks) == len(expected_noun_chunks) + for i, np in enumerate(noun_chunks): + assert np.text == expected_noun_chunks[i] diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index 1ff64eff2..6f747c550 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -111,7 +111,15 @@ def test_en_tokenizer_handles_times(en_tokenizer, text): @pytest.mark.parametrize( - "text,norms", [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])] + "text,norms", + [ + ("I'm", ["i", "am"]), + ("shan't", ["shall", "not"]), + ( + "Many factors cause cancer 'cause it is complex", + ["many", "factors", "cause", "cancer", "because", 
"it", "is", "complex"], + ), + ], ) def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms): tokens = en_tokenizer(text) diff --git a/spacy/tests/lang/ti/__init__.py b/spacy/tests/lang/ti/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ti/test_exception.py b/spacy/tests/lang/ti/test_exception.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ti/test_text.py b/spacy/tests/lang/ti/test_text.py new file mode 100644 index 000000000..9cb220d1e --- /dev/null +++ b/spacy/tests/lang/ti/test_text.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.ti.lex_attrs import like_num + + +def test_ti_tokenizer_handles_long_text(ti_tokenizer): + text = """ቻንስለር ጀርመን ኣንገላ መርከል ኣብታ ሃገር ቁጽሪ መትሓዝቲ ኮቪድ መዓልታዊ ክብረ መዝገብ ድሕሪ ምህራሙ- ጽኑዕ እገዳ ክግበር ጸዊዓ። + +መርከል ሎሚ ንታሕታዋይ ባይቶ ሃገራ ክትገልጽ ከላ፡ ኣብ ወሳኒ ምዕራፍ ቃልሲ ኢና ዘለና-ዳሕራዋይ ማዕበል ካብቲ ቀዳማይ ክገድድ ይኽእል`ዩ ኢላ። + +ትካል ምክልኻል ተላገብቲ ሕማማት ጀርመን፡ ኣብ ዝሓለፈ 24 ሰዓታት ኣብ ምልእቲ ጀርመር 590 ሰባት ብኮቪድ19 ምሟቶም ኣፍሊጡ`ሎ። + +ቻንስለር ኣንጀላ መርከል ኣብ እዋን በዓላት ልደት ስድራቤታት ክተኣኻኸባ ዝፍቀደለን`ኳ እንተኾነ ድሕሪኡ ኣብ ዘሎ ግዜ ግን እቲ እገዳታት ክትግበር ትደሊ።""" + tokens = ti_tokenizer(text) + + assert len(tokens) == 85 + + +@pytest.mark.parametrize( + "text,length", + [ + ("ቻንስለር ጀርመን ኣንገላ መርከል፧", 5), + ("“ስድራቤታት፧”", 4), + ("""ኣብ እዋን በዓላት ልደት ስድራቤታት ክተኣኻኸባ ዝፍቀደለን`ኳ እንተኾነ።""", 9), + ("ብግምት 10ኪ.ሜ. 
ጎይዩ።", 6), + ("ኣብ ዝሓለፈ 24 ሰዓታት...", 5), + ], +) +def test_ti_tokenizer_handles_cnts(ti_tokenizer, text, length): + tokens = ti_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10.000", True), + ("1000", True), + ("999,0", True), + ("ሐደ", True), + ("ክልተ", True), + ("ትሪልዮን", True), + ("ከልቢ", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(ti_tokenizer, text, match): + tokens = ti_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match \ No newline at end of file diff --git a/website/meta/universe.json b/website/meta/universe.json index 4d322be1c..794957d5e 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2670,6 +2670,60 @@ }, "category": ["scientific", "research", "standalone"], "tags": ["Evolutionary Computation", "Grammatical Evolution"] + }, + { + "id": "SpacyDotNet", + "title": "spaCy .NET Wrapper", + "slogan": "SpacyDotNet is a .NET Core compatible wrapper for spaCy, based on Python.NET", + "description": "This project relies on [Python.NET](http://pythonnet.github.io/) to interop with spaCy. It's not meant to be a complete and exhaustive implementation of all spaCy features and [APIs](https://spacy.io/api). Although it should be enough for basic tasks, it's considered as a starting point if you need to build a complex project using spaCy in .NET. Most of the basic features in _Spacy101_ are available. All `Container` classes are present (`Doc`, `Token`, `Span` and `Lexeme`) with their basic properties/methods running and also `Vocab` and `StringStore` in a limited form.
Anyway, any developer should be ready to add the missing properties or classes in a very straightforward manner.", + "github": "AMArostegui/SpacyDotNet", + "thumb": "https://raw.githubusercontent.com/AMArostegui/SpacyDotNet/master/cslogo.png", + "code_example": [ + "var spacy = new Spacy();", + "", + "var nlp = spacy.Load(\"en_core_web_sm\");", + "var doc = nlp.GetDocument(\"Apple is looking at buying U.K. startup for $1 billion\");", + "", + "foreach (Token token in doc.Tokens)", + " Console.WriteLine($\"{token.Text} {token.Lemma} {token.PoS} {token.Tag} {token.Dep} {token.Shape} {token.IsAlpha} {token.IsStop}\");", + "", + "Console.WriteLine(\"\");", + "foreach (Span ent in doc.Ents)", + " Console.WriteLine($\"{ent.Text} {ent.StartChar} {ent.EndChar} {ent.Label}\");", + "", + "nlp = spacy.Load(\"en_core_web_md\");", + "var tokens = nlp.GetDocument(\"dog cat banana afskfsd\");", + "", + "Console.WriteLine(\"\");", + "foreach (Token token in tokens.Tokens)", + " Console.WriteLine($\"{token.Text} {token.HasVector} {token.VectorNorm}, {token.IsOov}\");", + "", + "tokens = nlp.GetDocument(\"dog cat banana\");", + "Console.WriteLine(\"\");", + "foreach (Token token1 in tokens.Tokens)", + "{", + " foreach (Token token2 in tokens.Tokens)", + " Console.WriteLine($\"{token1.Text} {token2.Text} {token1.Similarity(token2) }\");", + "}", + "", + "doc = nlp.GetDocument(\"I love coffee\");", + "Console.WriteLine(\"\");", + "Console.WriteLine(doc.Vocab.Strings[\"coffee\"]);", + "Console.WriteLine(doc.Vocab.Strings[3197928453018144401]);", + "", + "Console.WriteLine(\"\");", + "foreach (Token word in doc.Tokens)", + "{", + " var lexeme = doc.Vocab[word.Text];", + " Console.WriteLine($@\"{lexeme.Text} {lexeme.Orth} {lexeme.Shape} {lexeme.Prefix} {lexeme.Suffix} {lexeme.IsAlpha} {lexeme.IsDigit} {lexeme.IsTitle} {lexeme.Lang}\");", + "}" + ], + "code_language": "csharp", + "author": "Antonio Miras", + "author_links": { + "github": "AMArostegui" + }, + "category": ["nonpython"] } ],