Mirror of https://github.com/explosion/spaCy.git
	Merge branch 'master' into develop
Commit e2d93e4852

106  .github/contributors/juliamakogon.md  (vendored, new file)
							|  | @ -0,0 +1,106 @@ | |||
| # spaCy contributor agreement | ||||
| 
 | ||||
| This spaCy Contributor Agreement (**"SCA"**) is based on the | ||||
| [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). | ||||
| The SCA applies to any contribution that you make to any product or project | ||||
| managed by us (the **"project"**), and sets out the intellectual property rights | ||||
| you grant to us in the contributed materials. The term **"us"** shall mean | ||||
| [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term | ||||
| **"you"** shall mean the person or entity identified below. | ||||
| 
 | ||||
| If you agree to be bound by these terms, fill in the information requested | ||||
| below and include the filled-in version with your first pull request, under the | ||||
| folder [`.github/contributors/`](/.github/contributors/). The name of the file | ||||
| should be your GitHub username, with the extension `.md`. For example, the user | ||||
| example_user would create the file `.github/contributors/example_user.md`. | ||||
| 
 | ||||
| Read this agreement carefully before signing. These terms and conditions | ||||
| constitute a binding legal agreement. | ||||
| 
 | ||||
| ## Contributor Agreement | ||||
| 
 | ||||
| 1. The term "contribution" or "contributed materials" means any source code, | ||||
| object code, patch, tool, sample, graphic, specification, manual, | ||||
| documentation, or any other material posted or submitted by you to the project. | ||||
| 
 | ||||
| 2. With respect to any worldwide copyrights, or copyright applications and | ||||
| registrations, in your contribution: | ||||
| 
 | ||||
|     * you hereby assign to us joint ownership, and to the extent that such | ||||
|     assignment is or becomes invalid, ineffective or unenforceable, you hereby | ||||
|     grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, | ||||
|     royalty-free, unrestricted license to exercise all rights under those | ||||
|     copyrights. This includes, at our option, the right to sublicense these same | ||||
|     rights to third parties through multiple levels of sublicensees or other | ||||
|     licensing arrangements; | ||||
| 
 | ||||
|     * you agree that each of us can do all things in relation to your | ||||
|     contribution as if each of us were the sole owners, and if one of us makes | ||||
|     a derivative work of your contribution, the one who makes the derivative | ||||
|     work (or has it made) will be the sole owner of that derivative work; | ||||
| 
 | ||||
|     * you agree that you will not assert any moral rights in your contribution | ||||
|     against us, our licensees or transferees; | ||||
| 
 | ||||
|     * you agree that we may register a copyright in your contribution and | ||||
|     exercise all ownership rights associated with it; and | ||||
| 
 | ||||
|     * you agree that neither of us has any duty to consult with, obtain the | ||||
|     consent of, pay or render an accounting to the other for any use or | ||||
|     distribution of your contribution. | ||||
| 
 | ||||
| 3. With respect to any patents you own, or that you can license without payment | ||||
| to any third party, you hereby grant to us a perpetual, irrevocable, | ||||
| non-exclusive, worldwide, no-charge, royalty-free license to: | ||||
| 
 | ||||
|     * make, have made, use, sell, offer to sell, import, and otherwise transfer | ||||
|     your contribution in whole or in part, alone or in combination with or | ||||
|     included in any product, work or materials arising out of the project to | ||||
|     which your contribution was submitted, and | ||||
| 
 | ||||
|     * at our option, to sublicense these same rights to third parties through | ||||
|     multiple levels of sublicensees or other licensing arrangements. | ||||
| 
 | ||||
| 4. Except as set out above, you keep all right, title, and interest in your | ||||
| contribution. The rights that you grant to us under these terms are effective | ||||
| on the date you first submitted a contribution to us, even if your submission | ||||
| took place before the date you sign these terms. | ||||
| 
 | ||||
| 5. You covenant, represent, warrant and agree that: | ||||
| 
 | ||||
|     * Each contribution that you submit is and shall be an original work of | ||||
|     authorship and you can legally grant the rights set out in this SCA; | ||||
| 
 | ||||
|     * to the best of your knowledge, each contribution will not violate any | ||||
|     third party's copyrights, trademarks, patents, or other intellectual | ||||
|     property rights; and | ||||
| 
 | ||||
|     * each contribution shall be in compliance with U.S. export control laws and | ||||
|     other applicable export and import laws. You agree to notify us if you | ||||
|     become aware of any circumstance which would make any of the foregoing | ||||
|     representations inaccurate in any respect. We may publicly disclose your | ||||
|     participation in the project, including the fact that you have signed the SCA. | ||||
| 
 | ||||
| 6. This SCA is governed by the laws of the State of California and applicable | ||||
| U.S. Federal law. Any choice of law rules will not apply. | ||||
| 
 | ||||
| 7. Please place an “x” on one of the applicable statements below. Please do NOT | ||||
| mark both statements: | ||||
| 
 | ||||
|     * [ ] I am signing on behalf of myself as an individual and no other person | ||||
|     or entity, including my employer, has or will have rights with respect to my | ||||
|     contributions. | ||||
| 
 | ||||
|     * [ ] I am signing on behalf of my employer or a legal entity and I have the | ||||
|     actual authority to contractually bind that entity. | ||||
| 
 | ||||
| ## Contributor Details | ||||
| 
 | ||||
| | Field                          | Entry                | | ||||
| |------------------------------- | -------------------- | | ||||
| | Name                           | Julia Makogon        | | ||||
| | Company name (if applicable)   | Semantrum            | | ||||
| | Title or role (if applicable)  |                      | | ||||
| | Date                           | 07.02.2019           | | ||||
| | GitHub username                | juliamakogon         | | ||||
| | Website (optional)             |                      | | ||||
spacy/lang/ru/lemmatizer.py

|  | @ -9,7 +9,7 @@ from ...compat import unicode_ | |||
| class RussianLemmatizer(Lemmatizer): | ||||
|     _morph = None | ||||
| 
 | ||||
|     def __init__(self): | ||||
|     def __init__(self, pymorphy2_lang='ru'): | ||||
|         super(RussianLemmatizer, self).__init__() | ||||
|         try: | ||||
|             from pymorphy2 import MorphAnalyzer | ||||
|  | @ -20,7 +20,7 @@ class RussianLemmatizer(Lemmatizer): | |||
|             ) | ||||
| 
 | ||||
|         if RussianLemmatizer._morph is None: | ||||
|             RussianLemmatizer._morph = MorphAnalyzer() | ||||
|             RussianLemmatizer._morph = MorphAnalyzer(lang=pymorphy2_lang) | ||||
| 
 | ||||
|     def __call__(self, string, univ_pos, morphology=None): | ||||
|         univ_pos = self.normalize_univ_pos(univ_pos) | ||||
|  |  | |||
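The change above threads a pymorphy2_lang argument through to pymorphy2's MorphAnalyzer, so the same class can load non-Russian dictionaries. A minimal sketch of standalone use, assuming pymorphy2 is installed (argument name and call signature as shown in the diff, output hedged as expected):

    # Sketch: standalone use of the parametrized lemmatizer; requires pymorphy2.
    from spacy.lang.ru.lemmatizer import RussianLemmatizer

    lemmatizer = RussianLemmatizer()        # defaults to pymorphy2_lang='ru'
    print(lemmatizer("книги", "noun"))      # expected: ['книга']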
										
											
(File diff suppressed because it is too large.)
								
								
									
75  spacy/lang/uk/__init__.py  (new file)
							|  | @ -0,0 +1,75 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| 
 | ||||
| # uncomment if files are available | ||||
| # from .norm_exceptions import NORM_EXCEPTIONS | ||||
| # from .tag_map import TAG_MAP | ||||
| # from .morph_rules import MORPH_RULES | ||||
| 
 | ||||
| # uncomment if lookup-based lemmatizer is available | ||||
| # from .lemmatizer import LOOKUP | ||||
| # from ...lemmatizerlookup import Lemmatizer | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
| from ...util import update_exc, add_lookups | ||||
| from ...language import Language | ||||
| from ...attrs import LANG, LIKE_NUM, NORM | ||||
| # from .tag_map import TAG_MAP | ||||
| from .lemmatizer import UkrainianLemmatizer | ||||
| 
 | ||||
| 
 | ||||
| # Create a Language subclass | ||||
| # Documentation: https://spacy.io/docs/usage/adding-languages | ||||
| 
 | ||||
| # This file should be placed in spacy/lang/xx (ISO code of language). | ||||
| # Before submitting a pull request, make sure to remove all comments from the | ||||
| # language data files, and run at least the basic tokenizer tests. Simply add the | ||||
| # language ID to the list of languages in spacy/tests/conftest.py to include it | ||||
| # in the basic tokenizer sanity tests. You can optionally add a fixture for the | ||||
| # language's tokenizer and add more specific tests. For more info, see the | ||||
| # tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests | ||||
| 
 | ||||
| 
 | ||||
| class UkrainianDefaults(Language.Defaults): | ||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'uk' # ISO code | ||||
|     # add more norm exception dictionaries here | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     # overwrite functions for lexical attributes | ||||
|     lex_attr_getters.update(LEX_ATTRS) | ||||
| 
 | ||||
|     # add custom tokenizer exceptions to base exceptions | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
| 
 | ||||
|     # add stop words | ||||
|     stop_words = STOP_WORDS | ||||
| 
 | ||||
|     # if available: add tag map | ||||
|     # tag_map = dict(TAG_MAP) | ||||
| 
 | ||||
|     # if available: add morph rules | ||||
|     # morph_rules = dict(MORPH_RULES) | ||||
| 
 | ||||
|     # if available: add lookup lemmatizer | ||||
|     # @classmethod | ||||
|     # def create_lemmatizer(cls, nlp=None): | ||||
|     #     return Lemmatizer(LOOKUP) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|         return UkrainianLemmatizer() | ||||
| 
 | ||||
| 
 | ||||
| class Ukrainian(Language): | ||||
|     lang = 'uk' # ISO code | ||||
|     Defaults = UkrainianDefaults # set Defaults to custom language defaults | ||||
| 
 | ||||
| 
 | ||||
| # set default export – this allows the language class to be lazy-loaded | ||||
| __all__ = ['Ukrainian'] | ||||
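With this module in place, the blank language class can be used directly; a minimal sketch, assuming pymorphy2 and the Ukrainian dictionaries are installed (the lemmatizer below is built together with the vocab):

    # Sketch: blank Ukrainian pipeline, tokenizer only (no trained model).
    from spacy.lang.uk import Ukrainian

    nlp = Ukrainian()
    doc = nlp("Де у Києві найсмачніша кава?")
    print([token.text for token in doc])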
							
								
								
									
23  spacy/lang/uk/examples.py  (new file)
							|  | @ -0,0 +1,23 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.uk.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Ніч на середу буде морозною.", | ||||
|     "Чим кращі книги ти читав, тим гірше спиш.",  # Serhiy Zhadan | ||||
|     "Найстаріші ґудзики, відомі людству, археологи знайшли в долині ріки Інд.", | ||||
|     "Слов'янське слово «Україна» вперше згадується у Київському літописному зводі за Іпатіївським списком під 1187 роком.", # wikipedia | ||||
|     "Де у Києві найсмачніша кава?", | ||||
|     "Від Нижнього озера довгими дерев’яними сходами, над якими синьо й біло горіли маленькі коробочки-ліхтарики, підіймалися до нього двоє стовусів: найкращий друг Вертутій і його дванадцятилітній онук Чублик.", # blyznets_viktor_semenovych/zemlia_svitliachkiv | ||||
|     "Китайський космічний зонд \"Чан'е-4\" вперше в історії здійснив м'яку посадку на зворотному боці Місяця.", | ||||
|     "Коли до губ твоїх лишається півподиху, коли до губ твоїх лишається півкроку – зіниці твої виткані із подиву, в очах у тебе синьо і широко.", # Hryhorij Czubaj | ||||
|     "Дорогу сестру збираю у дорогу, а брати вирішили не брати машину." # homographs | ||||
| ] | ||||
							
								
								
									
12  spacy/lang/uk/lemmatizer.py  (new file)
							|  | @ -0,0 +1,12 @@ | |||
| from ..ru.lemmatizer import RussianLemmatizer | ||||
| 
 | ||||
| 
 | ||||
| class UkrainianLemmatizer(RussianLemmatizer): | ||||
| 
 | ||||
|     def __init__(self, pymorphy2_lang='ru'): | ||||
|         try: | ||||
|             super(UkrainianLemmatizer, self).__init__(pymorphy2_lang='uk') | ||||
|         except ImportError: | ||||
|             raise ImportError( | ||||
|                 'The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: ' | ||||
|                 'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"') | ||||
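The subclass simply forces pymorphy2_lang='uk' when calling the Russian base class; a usage sketch, assuming pymorphy2 and pymorphy2-dicts-uk are installed (output hedged as expected):

    # Sketch: the lemmatizer on its own; raises ImportError with the install hint
    # above if pymorphy2 or the Ukrainian dictionaries are missing.
    from spacy.lang.uk.lemmatizer import UkrainianLemmatizer

    lemmatizer = UkrainianLemmatizer()
    print(lemmatizer("ґудзики", "noun"))    # expected: ['ґудзик']

Note that _morph is a class attribute of RussianLemmatizer, so whichever lemmatizer is constructed first in a process determines which pymorphy2 dictionaries both classes share.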
							
								
								
									
42  spacy/lang/uk/lex_attrs.py  (new file)
							|  | @ -0,0 +1,42 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| # import the symbols for the attrs you want to overwrite | ||||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
| 
 | ||||
| # Overwriting functions for lexical attributes | ||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#lex-attrs | ||||
| # Most of these functions, like is_lower or like_url, should be language- | ||||
| # independent. Others, like like_num (which includes both digits and number | ||||
| # words), require customisation. | ||||
| 
 | ||||
| 
 | ||||
| # Example: check if token resembles a number | ||||
| _num_words = ["більйон", "вісім", "вісімдесят", "вісімнадцять", "вісімсот", "восьмий", "два", "двадцять", "дванадцять", | ||||
|               "двісті", "дев'яносто", "дев'ятнадцять", "дев'ятсот", "дев'ять", "десять", "децильйон", "квадрильйон", | ||||
|               "квінтильйон", "мільйон", "мільярд", "нонильйон", "один", "одинадцять", "октильйон", "п'ятий", | ||||
|               "п'ятисотий", "п'ятнадцять", "п'ятсот", "п'ять", "секстильйон", "септильйон", "сім", "сімдесят", | ||||
|               "сімнадцять", "сімсот", "сорок", "сто", "тисяча", "три", "тридцять", "трильйон", "тринадцять", "триста", | ||||
|               "чотири", "чотириста", "чотирнадцять", "шістдесят", "шістнадцять", "шістсот", "шість"] | ||||
| 
 | ||||
| 
 | ||||
| def like_num(text): | ||||
|     text = text.replace(',', '').replace('.', '') | ||||
|     if text.isdigit(): | ||||
|         return True | ||||
|     if text.count('/') == 1: | ||||
|         num, denom = text.split('/') | ||||
|         if num.isdigit() and denom.isdigit(): | ||||
|             return True | ||||
|     if text in _num_words: | ||||
|         return True | ||||
|     return False | ||||
| 
 | ||||
| 
 | ||||
| # Create dictionary of functions to overwrite. The default lex_attr_getters are | ||||
| # updated with this one, so only the functions defined here are overwritten. | ||||
| 
 | ||||
| LEX_ATTRS = { | ||||
|     LIKE_NUM: like_num | ||||
| } | ||||
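like_num accepts digit strings (with ',' and '.' stripped), simple fractions, and the listed number words; a few quick checks, as a sketch of the behaviour defined above:

    # Sketch: behaviour of like_num as defined in the file above.
    from spacy.lang.uk.lex_attrs import like_num

    assert like_num("10")        # plain digits
    assert like_num("10,5")      # ',' and '.' are stripped before isdigit()
    assert like_num("3/4")       # simple fractions
    assert like_num("п'ять")     # listed number word
    assert not like_num("кава")  # anything else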
							
								
								
									
404  spacy/lang/uk/stop_words.py  (new file)
							|  | @ -0,0 +1,404 @@ | |||
| # encoding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| # Add stop words | ||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#stop-words | ||||
| # To improve readability, words should be ordered alphabetically and separated | ||||
| # by spaces and newlines. When adding stop words from an online source, always | ||||
| # include the link in a comment. Make sure to proofread and double-check the | ||||
| # words – lists available online are often known to contain mistakes. | ||||
| 
 | ||||
| 
 | ||||
| STOP_WORDS = set("""а | ||||
| або | ||||
| адже | ||||
| але | ||||
| алло | ||||
| багато | ||||
| без | ||||
| безперервно | ||||
| би | ||||
| більш | ||||
| більше | ||||
| біля | ||||
| близько | ||||
| був | ||||
| буває | ||||
| буде | ||||
| будемо | ||||
| будете | ||||
| будеш | ||||
| буду | ||||
| будуть | ||||
| будь | ||||
| була | ||||
| були | ||||
| було | ||||
| бути | ||||
| бывь | ||||
| в | ||||
| важлива | ||||
| важливе | ||||
| важливий | ||||
| важливі | ||||
| вам | ||||
| вами | ||||
| вас | ||||
| ваш | ||||
| ваша | ||||
| ваше | ||||
| ваші | ||||
| вгорі | ||||
| вгору | ||||
| вдалині | ||||
| вже | ||||
| ви | ||||
| від | ||||
| відсотків | ||||
| він | ||||
| вісім | ||||
| вісімнадцятий | ||||
| вісімнадцять | ||||
| вниз | ||||
| внизу | ||||
| вона | ||||
| вони | ||||
| воно | ||||
| восьмий | ||||
| всього | ||||
| втім | ||||
| г | ||||
| геть | ||||
| говорив | ||||
| говорить | ||||
| давно | ||||
| далеко | ||||
| далі | ||||
| дарма | ||||
| два | ||||
| двадцятий | ||||
| двадцять | ||||
| дванадцятий | ||||
| дванадцять | ||||
| дві | ||||
| двох | ||||
| де | ||||
| дев'ятий | ||||
| дев'ятнадцятий | ||||
| дев'ятнадцять | ||||
| дев'ять | ||||
| декілька | ||||
| день | ||||
| десятий | ||||
| десять | ||||
| дійсно | ||||
| для | ||||
| дня | ||||
| до | ||||
| добре | ||||
| довго | ||||
| доки | ||||
| досить | ||||
| другий | ||||
| дуже | ||||
| же | ||||
| життя | ||||
| з | ||||
| за | ||||
| завжди | ||||
| зазвичай | ||||
| зайнята | ||||
| зайнятий | ||||
| зайняті | ||||
| зайнято | ||||
| занадто | ||||
| зараз | ||||
| зате | ||||
| звичайно | ||||
| звідси | ||||
| звідусіль | ||||
| здається | ||||
| значить | ||||
| знову | ||||
| зовсім | ||||
| ім'я | ||||
| іноді | ||||
| інша | ||||
| інше | ||||
| інший | ||||
| інших | ||||
| інші | ||||
| її | ||||
| їй | ||||
| їх | ||||
| його | ||||
| йому | ||||
| ким | ||||
| кого | ||||
| кожен | ||||
| кожна | ||||
| кожне | ||||
| кожні | ||||
| коли | ||||
| кому | ||||
| краще | ||||
| крейдуючи | ||||
| кругом | ||||
| куди | ||||
| ласка | ||||
| лише | ||||
| люди | ||||
| людина | ||||
| майже | ||||
| мало | ||||
| мати | ||||
| мене | ||||
| мені | ||||
| менш | ||||
| менше | ||||
| ми | ||||
| мимо | ||||
| міг | ||||
| між | ||||
| мій | ||||
| мільйонів | ||||
| мною | ||||
| могти | ||||
| моє | ||||
| мож | ||||
| може | ||||
| можна | ||||
| можно | ||||
| можуть | ||||
| можхо | ||||
| мої | ||||
| мор | ||||
| моя | ||||
| на | ||||
| навіть | ||||
| навіщо | ||||
| навкруги | ||||
| нагорі | ||||
| над | ||||
| назад | ||||
| найбільш | ||||
| нам | ||||
| нами | ||||
| нарешті | ||||
| нас | ||||
| наш | ||||
| наша | ||||
| наше | ||||
| наші | ||||
| не | ||||
| небагато | ||||
| недалеко | ||||
| немає | ||||
| нерідко | ||||
| нещодавно | ||||
| нею | ||||
| нибудь | ||||
| нижче | ||||
| низько | ||||
| ним | ||||
| ними | ||||
| них | ||||
| ні | ||||
| ніби | ||||
| ніколи | ||||
| нікуди | ||||
| нічого | ||||
| ну | ||||
| нх | ||||
| нього | ||||
| о | ||||
| обоє | ||||
| один | ||||
| одинадцятий | ||||
| одинадцять | ||||
| однієї | ||||
| одній | ||||
| одного | ||||
| означає | ||||
| окрім | ||||
| он | ||||
| особливо | ||||
| ось | ||||
| перед | ||||
| перший | ||||
| під | ||||
| пізніше | ||||
| пір | ||||
| по | ||||
| повинно | ||||
| подів | ||||
| поки | ||||
| пора | ||||
| поруч | ||||
| посеред | ||||
| потім | ||||
| потрібно | ||||
| почала | ||||
| прекрасне | ||||
| прекрасно | ||||
| при | ||||
| про | ||||
| просто | ||||
| проте | ||||
| проти | ||||
| п'ятий | ||||
| п'ятнадцятий | ||||
| п'ятнадцять | ||||
| п'ять | ||||
| раз | ||||
| раніше | ||||
| рано | ||||
| раптом | ||||
| рік | ||||
| роки | ||||
| років | ||||
| року | ||||
| сам | ||||
| сама | ||||
| саме | ||||
| самим | ||||
| самими | ||||
| самих | ||||
| самі | ||||
| самій | ||||
| само | ||||
| самого | ||||
| самому | ||||
| саму | ||||
| світу | ||||
| свого | ||||
| своє | ||||
| свої | ||||
| своїй | ||||
| своїх | ||||
| свою | ||||
| сеаой | ||||
| себе | ||||
| сім | ||||
| сімнадцятий | ||||
| сімнадцять | ||||
| сказав | ||||
| сказала | ||||
| сказати | ||||
| скільки | ||||
| скрізь | ||||
| собі | ||||
| собою | ||||
| спасибі | ||||
| спочатку | ||||
| справ | ||||
| став | ||||
| суть | ||||
| сьогодні | ||||
| сьомий | ||||
| т | ||||
| та | ||||
| так | ||||
| така | ||||
| таке | ||||
| такий | ||||
| такі | ||||
| також | ||||
| там | ||||
| твій | ||||
| твоє | ||||
| твоя | ||||
| те | ||||
| тебе | ||||
| теж | ||||
| тепер | ||||
| ти | ||||
| тим | ||||
| тими | ||||
| тисяч | ||||
| тих | ||||
| ті | ||||
| тією | ||||
| тільки | ||||
| тобі | ||||
| тобою | ||||
| того | ||||
| тоді | ||||
| той | ||||
| том | ||||
| тому | ||||
| треба | ||||
| третій | ||||
| три | ||||
| тринадцятий | ||||
| тринадцять | ||||
| трохи | ||||
| ту | ||||
| туди | ||||
| тут | ||||
| у | ||||
| увесь | ||||
| уміти | ||||
| усе | ||||
| усі | ||||
| усім | ||||
| усіма | ||||
| усіх | ||||
| усію | ||||
| усього | ||||
| усьому | ||||
| усю | ||||
| усюди | ||||
| уся | ||||
| хіба | ||||
| хотіти | ||||
| хоч | ||||
| хоча | ||||
| хочеш | ||||
| хто | ||||
| це | ||||
| цей | ||||
| цим | ||||
| цими | ||||
| цих | ||||
| ці | ||||
| цій | ||||
| цього | ||||
| цьому | ||||
| цю | ||||
| ця | ||||
| час | ||||
| частіше | ||||
| часто | ||||
| часу | ||||
| через | ||||
| четвертий | ||||
| чи | ||||
| чим | ||||
| численна | ||||
| численне | ||||
| численний | ||||
| численні | ||||
| чого | ||||
| чому | ||||
| чотири | ||||
| чотирнадцятий | ||||
| чотирнадцять | ||||
| шістнадцятий | ||||
| шістнадцять | ||||
| шість | ||||
| шостий | ||||
| ще | ||||
| що | ||||
| щоб | ||||
| я | ||||
| як | ||||
| яка | ||||
| який | ||||
| яких | ||||
| які | ||||
| якій | ||||
| якого | ||||
| якщо | ||||
| """.split()) | ||||
							
								
								
									
36  spacy/lang/uk/tag_map.py  (new file)
							|  | @ -0,0 +1,36 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ | ||||
| from ...symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ | ||||
| 
 | ||||
| 
 | ||||
| # Add a tag map | ||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#tag-map | ||||
| # Universal Dependencies: http://universaldependencies.org/u/pos/all.html | ||||
| # The keys of the tag map should be strings in your tag set. The dictionary must | ||||
| # have an entry POS whose value is one of the Universal Dependencies tags. | ||||
| # Optionally, you can also include morphological features or other attributes. | ||||
| 
 | ||||
| 
 | ||||
| TAG_MAP = { | ||||
|     "ADV":      {POS: ADV}, | ||||
|     "NOUN":     {POS: NOUN}, | ||||
|     "ADP":      {POS: ADP}, | ||||
|     "PRON":     {POS: PRON}, | ||||
|     "SCONJ":    {POS: SCONJ}, | ||||
|     "PROPN":    {POS: PROPN}, | ||||
|     "DET":      {POS: DET}, | ||||
|     "SYM":      {POS: SYM}, | ||||
|     "INTJ":     {POS: INTJ}, | ||||
|     "PUNCT":    {POS: PUNCT}, | ||||
|     "NUM":      {POS: NUM}, | ||||
|     "AUX":      {POS: AUX}, | ||||
|     "X":        {POS: X}, | ||||
|     "CONJ":     {POS: CONJ}, | ||||
|     "CCONJ":    {POS: CCONJ}, | ||||
|     "ADJ":      {POS: ADJ}, | ||||
|     "VERB":     {POS: VERB}, | ||||
|     "PART":     {POS: PART}, | ||||
|     "SP":     	{POS: SPACE} | ||||
| } | ||||
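As the comment notes, entries may also carry morphological features next to the POS value; a sketch with a hypothetical fine-grained tag (the tag name and feature are illustrative only, not part of this diff):

    # Sketch: a tag-map entry with an extra morphological feature (hypothetical tag).
    from spacy.symbols import POS, NOUN

    TAG_MAP_EXTENDED = {
        "NOUN__Animacy=Anim": {POS: NOUN, "Animacy": "Anim"},
    }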
							
								
								
									
38  spacy/lang/uk/tokenizer_exceptions.py  (new file)
							|  | @ -0,0 +1,38 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| # import symbols – if you need to use more, add them here | ||||
| from ...symbols import ORTH, LEMMA, POS, NORM, NOUN | ||||
| 
 | ||||
| 
 | ||||
| # Add tokenizer exceptions | ||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions | ||||
| # Feel free to use custom logic to generate repetitive exceptions more efficiently. | ||||
| # If an exception is split into more than one token, the ORTH values combined always | ||||
| # need to match the original string. | ||||
| 
 | ||||
| # Exceptions should be added in the following format: | ||||
| 
 | ||||
| _exc = {} | ||||
| 
 | ||||
| for exc_data in [ | ||||
|     {ORTH: "вул.", LEMMA: "вулиця", NORM: "вулиця", POS: NOUN}, | ||||
|     {ORTH: "ім.", LEMMA: "ім'я", NORM: "імені", POS: NOUN}, | ||||
|     {ORTH: "просп.", LEMMA: "проспект", NORM: "проспект", POS: NOUN}, | ||||
|     {ORTH: "бул.", LEMMA: "бульвар", NORM: "бульвар", POS: NOUN}, | ||||
|     {ORTH: "пров.", LEMMA: "провулок", NORM: "провулок", POS: NOUN}, | ||||
|     {ORTH: "пл.", LEMMA: "площа", NORM: "площа", POS: NOUN}, | ||||
|     {ORTH: "г.", LEMMA: "гора", NORM: "гора", POS: NOUN}, | ||||
|     {ORTH: "п.", LEMMA: "пан", NORM: "пан", POS: NOUN}, | ||||
|     {ORTH: "м.", LEMMA: "місто", NORM: "місто", POS: NOUN}, | ||||
|     {ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN}, | ||||
|     {ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN}, | ||||
|     {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN}, | ||||
|     {ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}]: | ||||
|     _exc[exc_data[ORTH]] = [exc_data] | ||||
| 
 | ||||
| 
 | ||||
| # To keep things clean and readable, it's recommended to only declare the | ||||
| # TOKENIZER_EXCEPTIONS at the bottom: | ||||
| 
 | ||||
| TOKENIZER_EXCEPTIONS = _exc | ||||
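The effect is that abbreviations such as "вул." stay a single token and carry the given NORM; a quick check, as a sketch (needs pymorphy2 with the Ukrainian dictionaries, since constructing the language class also builds the lemmatizer):

    # Sketch: tokenizer exceptions in action.
    from spacy.lang.uk import Ukrainian

    nlp = Ukrainian()
    doc = nlp("вул. Хрещатик")
    print(len(doc))                    # expected: 2 ("вул." is kept as one token)
    print(doc[0].text, doc[0].norm_)   # expected: вул. вулиця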
spacy/tests/conftest.py

|  | @ -117,6 +117,12 @@ def tr_tokenizer(): | |||
|     return get_lang_class("tr").Defaults.create_tokenizer() | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def uk_tokenizer(): | ||||
|     pymorphy = pytest.importorskip("pymorphy2") | ||||
|     return util.get_lang_class("uk").Defaults.create_tokenizer() | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def ca_tokenizer(): | ||||
|     return get_lang_class("ca").Defaults.create_tokenizer() | ||||
|  |  | |||
							
								
								
									
0  spacy/tests/lang/uk/__init__.py  (new, empty file)
							
								
								
									
128  spacy/tests/lang/uk/test_tokenizer.py  (new file)
							|  | @ -0,0 +1,128 @@ | |||
| # coding: utf-8 | ||||
| """Test that open, closed and paired punctuation is split off correctly.""" | ||||
| 
 | ||||
| 
 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| PUNCT_OPEN = ['(', '[', '{', '*'] | ||||
| PUNCT_CLOSE = [')', ']', '}', '*'] | ||||
| PUNCT_PAIRED = [('(', ')'),  ('[', ']'), ('{', '}'), ('*', '*')] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["(", "((", "<"]) | ||||
| def test_uk_tokenizer_handles_only_punct(uk_tokenizer, text): | ||||
|     tokens = uk_tokenizer(text) | ||||
|     assert len(tokens) == len(text) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('punct', PUNCT_OPEN) | ||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||
| def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text): | ||||
|     tokens = uk_tokenizer(punct + text) | ||||
|     assert len(tokens) == 2 | ||||
|     assert tokens[0].text == punct | ||||
|     assert tokens[1].text == text | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('punct', PUNCT_CLOSE) | ||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||
| def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text): | ||||
|     tokens = uk_tokenizer(text + punct) | ||||
|     assert len(tokens) == 2 | ||||
|     assert tokens[0].text == text | ||||
|     assert tokens[1].text == punct | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('punct', PUNCT_OPEN) | ||||
| @pytest.mark.parametrize('punct_add', ["`"]) | ||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||
| def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, text): | ||||
|     tokens = uk_tokenizer(punct + punct_add + text) | ||||
|     assert len(tokens) == 3 | ||||
|     assert tokens[0].text == punct | ||||
|     assert tokens[1].text == punct_add | ||||
|     assert tokens[2].text == text | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('punct', PUNCT_CLOSE) | ||||
| @pytest.mark.parametrize('punct_add', ["'"]) | ||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||
| def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add, text): | ||||
|     tokens = uk_tokenizer(text + punct + punct_add) | ||||
|     assert len(tokens) == 3 | ||||
|     assert tokens[0].text == text | ||||
|     assert tokens[1].text == punct | ||||
|     assert tokens[2].text == punct_add | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('punct', PUNCT_OPEN) | ||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||
| def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text): | ||||
|     tokens = uk_tokenizer(punct + punct + punct + text) | ||||
|     assert len(tokens) == 4 | ||||
|     assert tokens[0].text == punct | ||||
|     assert tokens[3].text == text | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('punct', PUNCT_CLOSE) | ||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||
| def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text): | ||||
|     tokens = uk_tokenizer(text + punct + punct + punct) | ||||
|     assert len(tokens) == 4 | ||||
|     assert tokens[0].text == text | ||||
|     assert tokens[1].text == punct | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["'Тест"]) | ||||
| def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text): | ||||
|     tokens = uk_tokenizer(text) | ||||
|     assert len(tokens) == 2 | ||||
|     assert tokens[0].text == "'" | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["Тест''"]) | ||||
| def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text): | ||||
|     tokens = uk_tokenizer(text) | ||||
|     assert len(tokens) == 2 | ||||
|     tokens_punct = uk_tokenizer("''") | ||||
|     assert len(tokens_punct) == 1 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) | ||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||
| def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open, | ||||
|                                               punct_close, text): | ||||
|     tokens = uk_tokenizer(punct_open + text + punct_close) | ||||
|     assert len(tokens) == 3 | ||||
|     assert tokens[0].text == punct_open | ||||
|     assert tokens[1].text == text | ||||
|     assert tokens[2].text == punct_close | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) | ||||
| @pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")]) | ||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||
| def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close, | ||||
|                                      punct_open2, punct_close2, text): | ||||
|     tokens = uk_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) | ||||
|     assert len(tokens) == 5 | ||||
|     assert tokens[0].text == punct_open2 | ||||
|     assert tokens[1].text == punct_open | ||||
|     assert tokens[2].text == text | ||||
|     assert tokens[3].text == punct_close | ||||
|     assert tokens[4].text == punct_close2 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."]) | ||||
| def test_uk_tokenizer_splits_trailing_dot(uk_tokenizer, text): | ||||
|     tokens = uk_tokenizer(text) | ||||
|     assert tokens[1].text == "." | ||||
| 
 | ||||
| 
 | ||||
| def test_uk_tokenizer_splits_bracket_period(uk_tokenizer): | ||||
|     text = "(Раз, два, три, проверка)." | ||||
|     tokens = uk_tokenizer(text) | ||||
|     assert tokens[len(tokens) - 1].text == "." | ||||
							
								
								
									
18  spacy/tests/lang/uk/test_tokenizer_exc.py  (new file)
							|  | @ -0,0 +1,18 @@ | |||
| # coding: utf-8 | ||||
| """Test that tokenizer exceptions are parsed correctly.""" | ||||
| 
 | ||||
| 
 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text,norms,lemmas', [("ім.", ["імені"], ["ім'я"]), | ||||
|                                         ("проф.", ["професор"], ["професор"])]) | ||||
| def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas): | ||||
|     tokens = uk_tokenizer(text) | ||||
|     assert len(tokens) == 1 | ||||
|     assert [token.norm_ for token in tokens] == norms | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||