From 604acb6ace9991f3be457a11de4f8ffa40f06450 Mon Sep 17 00:00:00 2001 From: estr4ng7d Date: Fri, 24 May 2019 05:29:42 -0700 Subject: [PATCH 1/4] Marathi Language Support (#3767) * Adding Marathi language details and folder to it * Adding few changes and running tests * Adding few changes and running tests * Update __init__.py mh -> mr * Rename spacy/lang/mh/__init__.py to spacy/lang/mr/__init__.py * mh -> mr --- .github/contributors/estr4ng7d.md | 106 ++++++++++++++++ spacy/lang/mr/__init__.py | 20 +++ spacy/lang/mr/stop_words.py | 196 ++++++++++++++++++++++++++++++ 3 files changed, 322 insertions(+) create mode 100644 .github/contributors/estr4ng7d.md create mode 100644 spacy/lang/mr/__init__.py create mode 100644 spacy/lang/mr/stop_words.py diff --git a/.github/contributors/estr4ng7d.md b/.github/contributors/estr4ng7d.md new file mode 100644 index 000000000..35c095c47 --- /dev/null +++ b/.github/contributors/estr4ng7d.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. 
+ +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Amey Baviskar | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 21-May-2019 | +| GitHub username | estr4ng7d | +| Website (optional) | | diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py new file mode 100644 index 000000000..538540935 --- /dev/null +++ b/spacy/lang/mr/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class MarathiDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "mr" + stop_words = STOP_WORDS + + +class Marathi(Language): + lang = "mr" + Defaults = MarathiDefaults + + +__all__ = ["Marathi"] diff --git a/spacy/lang/mr/stop_words.py b/spacy/lang/mr/stop_words.py new file mode 100644 index 000000000..0b0cd035d --- /dev/null +++ b/spacy/lang/mr/stop_words.py @@ -0,0 +1,196 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/blob/master/dist/mr.json +STOP_WORDS = set( + """ +न +अतरी +तो +हें +तें +कां +आणि +जें +जे +मग +ते +मी +जो +परी +गा +हे +ऐसें +आतां +नाहीं +तेथ +हा +तया +असे +म्हणे +काय +कीं +जैसें +तंव +तूं +होय +जैसा +आहे +पैं +तैसा +जरी +म्हणोनि +एक +ऐसा +जी +ना +मज +एथ +या +जेथ +जया +तुज +तेणें +तैं +पां +असो +करी +ऐसी +येणें +जाहला +तेंचि +आघवें +होती +कांहीं +होऊनि +एकें +मातें +ठायीं 
+ये +सकळ +केलें +जेणें +जाण +जैसी +होये +जेवीं +एऱ्हवीं +मीचि +किरीटी +दिसे +देवा +हो +तरि +कीजे +तैसे +आपण +तिये +कर्म +नोहे +इये +पडे +माझें +तैसी +लागे +नाना +जंव +कीर +अधिक +अनेक +अशी +असलयाचे +असलेल्या +असा +असून +असे +आज +आणि +आता +आपल्या +आला +आली +आले +आहे +आहेत +एक +एका +कमी +करणयात +करून +का +काम +काय +काही +किवा +की +केला +केली +केले +कोटी +गेल्या +घेऊन +जात +झाला +झाली +झाले +झालेल्या +टा +तर +तरी +तसेच +ता +ती +तीन +ते +तो +त्या +त्याचा +त्याची +त्याच्या +त्याना +त्यानी +त्यामुळे +त्री +दिली +दोन +न +पण +पम +परयतन +पाटील +म +मात्र +माहिती +मी +मुबी +म्हणजे +म्हणाले +म्हणून +या +याचा +याची +याच्या +याना +यानी +येणार +येत +येथील +येथे +लाख +व +व्यकत +सर्व +सागित्ले +सुरू +हजार +हा +ही +हे +होणार +होत +होता +होती +होते +""".split() +) From ed7be3f64cb9208fee602128c7a28ded3b3677b0 Mon Sep 17 00:00:00 2001 From: Ujwal Narayan <31547494+ujwal-narayan@users.noreply.github.com> Date: Mon, 27 May 2019 15:22:52 +0530 Subject: [PATCH 2/4] Update norm_exceptions.py (#3778) * Update norm_exceptions.py Extended the Currency set to include Franc, Indian Rupee, Bangladeshi Taka, Korean Won, Mexican Peso, and Egyptian Pound * Fix formatting [ci skip] --- spacy/lang/norm_exceptions.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spacy/lang/norm_exceptions.py b/spacy/lang/norm_exceptions.py index 8766e2815..341967a78 100644 --- a/spacy/lang/norm_exceptions.py +++ b/spacy/lang/norm_exceptions.py @@ -53,5 +53,11 @@ BASE_NORMS = { "US$": "$", "C$": "$", "A$": "$", - "₺" : "$", + "₺": "$", + "₹": "$", + "৳": "$", + "₩": "$", + "Mex$": "$", + "₣": "$", + "E£": "$", } From a8416c46f74a6a9d159090ad6dbc644cd6d40e92 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 28 May 2019 17:11:39 +0200 Subject: [PATCH 3/4] Use string name in setup.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hopefully this will trick GitHub's parser into recognising it as a Python package and show us the dependents / "used 
by" statistics 🤞 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2c05f8d70..c440b016f 100755 --- a/setup.py +++ b/setup.py @@ -209,7 +209,7 @@ def setup_package(): generate_cython(root, "spacy") setup( - name=about["__title__"], + name="spacy", zip_safe=False, packages=PACKAGES, package_data=PACKAGE_DATA, From 89379a7fa45f94bce4945284a7781eaaa7bc06ff Mon Sep 17 00:00:00 2001 From: mak <9056896+maknotavailable@users.noreply.github.com> Date: Wed, 29 May 2019 09:51:55 +0100 Subject: [PATCH 4/4] Corrected example model URL in requirements.txt (#3786) The URL used to show how to add a model to the requirements.txt had the old release path (excl. explosion). --- website/docs/usage/models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 1dde6f94b..5df4ab458 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -326,7 +326,7 @@ URLs. ```text ### requirements.txt spacy>=2.0.0,<3.0.0 -https://github.com/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm ``` Specifying `#egg=` with the package name tells pip which package to expect from