diff --git a/.github/contributors/Pavle992.md b/.github/contributors/Pavle992.md new file mode 100644 index 000000000..74ba69f46 --- /dev/null +++ b/.github/contributors/Pavle992.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Pavle Vidanović | +| Company name (if applicable) | - | +| Title or role (if applicable) | - | +| Date | August 4, 2019 | +| GitHub username | Pavle992 | +| Website (optional) | - | diff --git a/spacy/lang/rs/__init__.py b/spacy/lang/rs/__init__.py new file mode 100644 index 000000000..4228fb193 --- /dev/null +++ b/spacy/lang/rs/__init__.py @@ -0,0 +1,28 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class SerbianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "rs" + lex_attr_getters[NORM] = add_lookups( + Language.Defaults.lex_attr_getters[NORM], BASE_NORMS + ) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) + stop_words = STOP_WORDS + + +class Serbian(Language): + lang = "rs" + Defaults = SerbianDefaults + + +__all__ = ["Serbian"] diff --git a/spacy/lang/rs/stop_words.py b/spacy/lang/rs/stop_words.py new file mode 100644 index 000000000..9712327f8 --- /dev/null +++ b/spacy/lang/rs/stop_words.py @@ -0,0 +1,397 @@ +# coding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set( + """ +а +авај +ако +ал +али +арх +ау +ах +аха +ај +бар +би +била +били +било +бисмо +бисте +бих +бијасмо +бијасте +бијах +бијаху +бијаше +биће +близу +број +брр +буде +будимо +будите +буду +будући +бум +бућ +вам +вама +вас +ваша +ваше +вашим +вашима +ваљда +веома +вероватно +већ +већина +ви +видео +више +врло +врх +га +где +гиц +год +горе +гђекоје +да +дакле +дана +данас +дај +два +де +дедер +делимице +делимично +дем +до +добар +добити +довечер +докле +доле +донекле +досад +доскоро +дотад +дотле +дошао +доћи +другамо 
+другде +други +е +ево +ено +ето +ех +ехе +еј +желела +желеле +желели +желело +желех +желећи +жели +за +заиста +зар +затим +зато +захвалити +зашто +збиља +зимус +знати +зум +и +иде +из +изван +изволи +између +изнад +икада +икакав +икаква +икакве +икакви +икаквим +икаквима +икаквих +икакво +икаквог +икаквога +икаквом +икаквоме +икаквој +или +им +има +имам +имао +испод +их +ију +ићи +кад +када +кога +којекакав +којима +коју +кришом +лани +ли +мали +мањи +ме +мене +мени +ми +мимо +мисли +много +могу +мора +морао +мој +моја +моје +моји +моју +моћи +му +на +над +након +нам +нама +нас +наша +наше +нашег +наши +наћи +не +негде +нека +некад +неке +неког +неку +нема +немам +неко +неће +нећемо +нећете +нећеш +нећу +ни +никада +никога +никоје +никоји +никоју +нисам +ниси +нисте +нису +ништа +ниједан +но +о +ова +овако +овамо +овај +овде +ове +овим +овима +ово +овој +од +одмах +око +около +он +онај +оне +оним +онима +оном +оној +ону +осим +остали +отишао +па +пак +питати +по +поводом +под +подаље +пожељан +пожељна +поиздаље +поименце +понекад +попреко +поред +после +потаман +потрбушке +поуздано +почетак +поједини +правити +први +преко +према +прије +пут +пљус +радије +с +са +сав +сада +сам +само +сасвим +сва +сваки +сви +свим +свог +свом +свој +своја +своје +своју +сву +свугде +се +себе +себи +си +смети +смо +ствар +стварно +сте +су +сутра +та +тачно +тако +такође +тамо +твој +твоја +твоје +твоји +твоју +те +тебе +теби +ти +тима +то +томе +тој +ту +у +увек +увијек +уз +уза +узалуд +уздуж +узети +умало +унутра +употребити +упркос +учинио +учинити +хало +хвала +хеј +хм +хоп +хоће +хоћемо +хоћете +хоћеш +хоћу +хтедосте +хтедох +хтедоше +хтела +хтеле +хтели +хтео +хтејасмо +хтејасте +хтејаху +хура +често +чијем +чији +чијим +чијима +шиц +штагод +што +штогод +ја +је +један +једини +једна +једне +једни +једно +једном +јер +јесам +јеси +јесмо +јесу +јим +јој +ју +јуче +његова +његово +њезин +њезина +њезино +њему +њен +њим +њима +њихова +њихово +њој +њу +ће +ћемо +ћете +ћеш +ћу 
+""".split() +) diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 9cd0a78c3..f6938c252 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -10,8 +10,8 @@ from spacy.util import get_lang_class # excluded: ja, ru, th, uk, vi, zh LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", - "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", - "sl", "sq", "sv", "ta", "te", "tl", "tr", "tt", "ur"] + "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "rs", "si", + "sk", "sl", "sq", "sv", "ta", "te", "tl", "tr", "tt", "ur"] # fmt: on