mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)

Commit e2d93e4852: Merge branch 'master' into develop
							
								
								
									
.github/contributors/juliamakogon.md (new file, 106 lines, vendored)
							|  | @ -0,0 +1,106 @@ | ||||||
|  | # spaCy contributor agreement | ||||||
|  | 
 | ||||||
|  | This spaCy Contributor Agreement (**"SCA"**) is based on the | ||||||
|  | [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). | ||||||
|  | The SCA applies to any contribution that you make to any product or project | ||||||
|  | managed by us (the **"project"**), and sets out the intellectual property rights | ||||||
|  | you grant to us in the contributed materials. The term **"us"** shall mean | ||||||
|  | [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term | ||||||
|  | **"you"** shall mean the person or entity identified below. | ||||||
|  | 
 | ||||||
|  | If you agree to be bound by these terms, fill in the information requested | ||||||
|  | below and include the filled-in version with your first pull request, under the | ||||||
|  | folder [`.github/contributors/`](/.github/contributors/). The name of the file | ||||||
|  | should be your GitHub username, with the extension `.md`. For example, the user | ||||||
|  | example_user would create the file `.github/contributors/example_user.md`. | ||||||
|  | 
 | ||||||
|  | Read this agreement carefully before signing. These terms and conditions | ||||||
|  | constitute a binding legal agreement. | ||||||
|  | 
 | ||||||
|  | ## Contributor Agreement | ||||||
|  | 
 | ||||||
|  | 1. The term "contribution" or "contributed materials" means any source code, | ||||||
|  | object code, patch, tool, sample, graphic, specification, manual, | ||||||
|  | documentation, or any other material posted or submitted by you to the project. | ||||||
|  | 
 | ||||||
|  | 2. With respect to any worldwide copyrights, or copyright applications and | ||||||
|  | registrations, in your contribution: | ||||||
|  | 
 | ||||||
|  |     * you hereby assign to us joint ownership, and to the extent that such | ||||||
|  |     assignment is or becomes invalid, ineffective or unenforceable, you hereby | ||||||
|  |     grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, | ||||||
|  |     royalty-free, unrestricted license to exercise all rights under those | ||||||
|  |     copyrights. This includes, at our option, the right to sublicense these same | ||||||
|  |     rights to third parties through multiple levels of sublicensees or other | ||||||
|  |     licensing arrangements; | ||||||
|  | 
 | ||||||
|  |     * you agree that each of us can do all things in relation to your | ||||||
|  |     contribution as if each of us were the sole owners, and if one of us makes | ||||||
|  |     a derivative work of your contribution, the one who makes the derivative | ||||||
|  |     work (or has it made) will be the sole owner of that derivative work; | ||||||
|  | 
 | ||||||
|  |     * you agree that you will not assert any moral rights in your contribution | ||||||
|  |     against us, our licensees or transferees; | ||||||
|  | 
 | ||||||
|  |     * you agree that we may register a copyright in your contribution and | ||||||
|  |     exercise all ownership rights associated with it; and | ||||||
|  | 
 | ||||||
|  |     * you agree that neither of us has any duty to consult with, obtain the | ||||||
|  |     consent of, pay or render an accounting to the other for any use or | ||||||
|  |     distribution of your contribution. | ||||||
|  | 
 | ||||||
|  | 3. With respect to any patents you own, or that you can license without payment | ||||||
|  | to any third party, you hereby grant to us a perpetual, irrevocable, | ||||||
|  | non-exclusive, worldwide, no-charge, royalty-free license to: | ||||||
|  | 
 | ||||||
|  |     * make, have made, use, sell, offer to sell, import, and otherwise transfer | ||||||
|  |     your contribution in whole or in part, alone or in combination with or | ||||||
|  |     included in any product, work or materials arising out of the project to | ||||||
|  |     which your contribution was submitted, and | ||||||
|  | 
 | ||||||
|  |     * at our option, to sublicense these same rights to third parties through | ||||||
|  |     multiple levels of sublicensees or other licensing arrangements. | ||||||
|  | 
 | ||||||
|  | 4. Except as set out above, you keep all right, title, and interest in your | ||||||
|  | contribution. The rights that you grant to us under these terms are effective | ||||||
|  | on the date you first submitted a contribution to us, even if your submission | ||||||
|  | took place before the date you sign these terms. | ||||||
|  | 
 | ||||||
|  | 5. You covenant, represent, warrant and agree that: | ||||||
|  | 
 | ||||||
|  |     * Each contribution that you submit is and shall be an original work of | ||||||
|  |     authorship and you can legally grant the rights set out in this SCA; | ||||||
|  | 
 | ||||||
|  |     * to the best of your knowledge, each contribution will not violate any | ||||||
|  |     third party's copyrights, trademarks, patents, or other intellectual | ||||||
|  |     property rights; and | ||||||
|  | 
 | ||||||
|  |     * each contribution shall be in compliance with U.S. export control laws and | ||||||
|  |     other applicable export and import laws. You agree to notify us if you | ||||||
|  |     become aware of any circumstance which would make any of the foregoing | ||||||
|  |     representations inaccurate in any respect. We may publicly disclose your | ||||||
|  |     participation in the project, including the fact that you have signed the SCA. | ||||||
|  | 
 | ||||||
|  | 6. This SCA is governed by the laws of the State of California and applicable | ||||||
|  | U.S. Federal law. Any choice of law rules will not apply. | ||||||
|  | 
 | ||||||
|  | 7. Please place an “x” on one of the applicable statements below. Please do NOT | ||||||
|  | mark both statements: | ||||||
|  | 
 | ||||||
|  |     * [ ] I am signing on behalf of myself as an individual and no other person | ||||||
|  |     or entity, including my employer, has or will have rights with respect to my | ||||||
|  |     contributions. | ||||||
|  | 
 | ||||||
|  |     * [ ] I am signing on behalf of my employer or a legal entity and I have the | ||||||
|  |     actual authority to contractually bind that entity. | ||||||
|  | 
 | ||||||
|  | ## Contributor Details | ||||||
|  | 
 | ||||||
|  | | Field                          | Entry                | | ||||||
|  | |------------------------------- | -------------------- | | ||||||
|  | | Name                           | Julia Makogon        | | ||||||
|  | | Company name (if applicable)   | Semantrum            | | ||||||
|  | | Title or role (if applicable)  |                      | | ||||||
|  | | Date                           | 07.02.2019           | | ||||||
|  | | GitHub username                | juliamakogon         | | ||||||
|  | | Website (optional)             |                      | | ||||||
spacy/lang/ru/lemmatizer.py

|  | @ -9,7 +9,7 @@ from ...compat import unicode_ | ||||||
| class RussianLemmatizer(Lemmatizer): | class RussianLemmatizer(Lemmatizer): | ||||||
|     _morph = None |     _morph = None | ||||||
| 
 | 
 | ||||||
|     def __init__(self): |     def __init__(self, pymorphy2_lang='ru'): | ||||||
|         super(RussianLemmatizer, self).__init__() |         super(RussianLemmatizer, self).__init__() | ||||||
|         try: |         try: | ||||||
|             from pymorphy2 import MorphAnalyzer |             from pymorphy2 import MorphAnalyzer | ||||||
|  | @ -20,7 +20,7 @@ class RussianLemmatizer(Lemmatizer): | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|         if RussianLemmatizer._morph is None: |         if RussianLemmatizer._morph is None: | ||||||
|             RussianLemmatizer._morph = MorphAnalyzer() |             RussianLemmatizer._morph = MorphAnalyzer(lang=pymorphy2_lang) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, string, univ_pos, morphology=None): |     def __call__(self, string, univ_pos, morphology=None): | ||||||
|         univ_pos = self.normalize_univ_pos(univ_pos) |         univ_pos = self.normalize_univ_pos(univ_pos) | ||||||
|  |  | ||||||
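The change above threads a `pymorphy2_lang` argument from the `RussianLemmatizer` constructor into pymorphy2's `MorphAnalyzer`, which is what lets the Ukrainian lemmatizer further down reuse the same class. A minimal sketch of the changed constructor in use (assumes pymorphy2 is installed; the lemma in the comment is illustrative, not taken from this diff):

```python
# Sketch only: exercise the new pymorphy2_lang keyword on RussianLemmatizer.
from spacy.lang.ru.lemmatizer import RussianLemmatizer

lemmatizer = RussianLemmatizer(pymorphy2_lang="ru")  # default language, same behaviour as before
# __call__(string, univ_pos, morphology=None) returns a list of candidate lemmas
print(lemmatizer("гуляли", "VERB"))  # e.g. ['гулять']
```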
										
											
(One file diff suppressed because it is too large.)

spacy/lang/uk/__init__.py (new file, 75 lines)
							|  | @ -0,0 +1,75 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||||
|  | from .stop_words import STOP_WORDS | ||||||
|  | from .lex_attrs import LEX_ATTRS | ||||||
|  | 
 | ||||||
|  | # uncomment if files are available | ||||||
|  | # from .norm_exceptions import NORM_EXCEPTIONS | ||||||
|  | # from .tag_map import TAG_MAP | ||||||
|  | # from .morph_rules import MORPH_RULES | ||||||
|  | 
 | ||||||
|  | # uncomment if lookup-based lemmatizer is available | ||||||
|  | # from .lemmatizer import LOOKUP | ||||||
|  | # from ...lemmatizerlookup import Lemmatizer | ||||||
|  | 
 | ||||||
|  | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
|  | from ..norm_exceptions import BASE_NORMS | ||||||
|  | from ...util import update_exc, add_lookups | ||||||
|  | from ...language import Language | ||||||
|  | from ...attrs import LANG, LIKE_NUM, NORM | ||||||
|  | # from .tag_map import TAG_MAP | ||||||
|  | from .lemmatizer import UkrainianLemmatizer | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Create a Language subclass | ||||||
|  | # Documentation: https://spacy.io/docs/usage/adding-languages | ||||||
|  | 
 | ||||||
|  | # This file should be placed in spacy/lang/xx (ISO code of language). | ||||||
|  | # Before submitting a pull request, make sure to remove all comments from the | ||||||
|  | # language data files, and run at least the basic tokenizer tests. Simply add the | ||||||
|  | # language ID to the list of languages in spacy/tests/conftest.py to include it | ||||||
|  | # in the basic tokenizer sanity tests. You can optionally add a fixture for the | ||||||
|  | # language's tokenizer and add more specific tests. For more info, see the | ||||||
|  | # tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class UkrainianDefaults(Language.Defaults): | ||||||
|  |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|  |     lex_attr_getters[LANG] = lambda text: 'uk' # ISO code | ||||||
|  |     # add more norm exception dictionaries here | ||||||
|  |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
|  | 
 | ||||||
|  |     # overwrite functions for lexical attributes | ||||||
|  |     lex_attr_getters.update(LEX_ATTRS) | ||||||
|  | 
 | ||||||
|  |     # add custom tokenizer exceptions to base exceptions | ||||||
|  |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|  | 
 | ||||||
|  |     # add stop words | ||||||
|  |     stop_words = STOP_WORDS | ||||||
|  | 
 | ||||||
|  |     # if available: add tag map | ||||||
|  |     # tag_map = dict(TAG_MAP) | ||||||
|  | 
 | ||||||
|  |     # if available: add morph rules | ||||||
|  |     # morph_rules = dict(MORPH_RULES) | ||||||
|  | 
 | ||||||
|  |     # if available: add lookup lemmatizer | ||||||
|  |     # @classmethod | ||||||
|  |     # def create_lemmatizer(cls, nlp=None): | ||||||
|  |     #     return Lemmatizer(LOOKUP) | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def create_lemmatizer(cls, nlp=None): | ||||||
|  |         return UkrainianLemmatizer() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Ukrainian(Language): | ||||||
|  |     lang = 'uk' # ISO code | ||||||
|  |     Defaults = UkrainianDefaults # set Defaults to custom language defaults | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # set default export – this allows the language class to be lazy-loaded | ||||||
|  | __all__ = ['Ukrainian'] | ||||||
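A hedged smoke test of the new language class wired up above; note that instantiating it already needs pymorphy2 with Ukrainian dictionaries, because the defaults install `UkrainianLemmatizer`:

```python
# Sketch only: blank Ukrainian pipeline, tokenization only (no statistical models).
from spacy.lang.uk import Ukrainian

nlp = Ukrainian()
doc = nlp("Де у Києві найсмачніша кава?")
print([token.text for token in doc])
```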
							
								
								
									
spacy/lang/uk/examples.py (new file, 23 lines)
							|  | @ -0,0 +1,23 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | Example sentences to test spaCy and its language models. | ||||||
|  | 
 | ||||||
|  | >>> from spacy.lang.uk.examples import sentences | ||||||
|  | >>> docs = nlp.pipe(sentences) | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | sentences = [ | ||||||
|  |     "Ніч на середу буде морозною.", | ||||||
|  |     "Чим кращі книги ти читав, тим гірше спиш.",  # Serhiy Zhadan | ||||||
|  |     "Найстаріші ґудзики, відомі людству, археологи знайшли в долині ріки Інд.", | ||||||
|  |     "Слов'янське слово «Україна» вперше згадується у Київському літописному зводі за Іпатіївським списком під 1187 роком.", # wikipedia | ||||||
|  |     "Де у Києві найсмачніша кава?", | ||||||
|  |     "Від Нижнього озера довгими дерев’яними сходами, над якими синьо й біло горіли маленькі коробочки-ліхтарики, підіймалися до нього двоє стовусів: найкращий друг Вертутій і його дванадцятилітній онук Чублик.", # blyznets_viktor_semenovych/zemlia_svitliachkiv | ||||||
|  |     "Китайський космічний зонд \"Чан'е-4\" вперше в історії здійснив м'яку посадку на зворотному боці Місяця.", | ||||||
|  |     "Коли до губ твоїх лишається півподиху, коли до губ твоїх лишається півкроку – зіниці твої виткані із подиву, в очах у тебе синьо і широко.", # Hryhorij Czubaj | ||||||
|  |     "Дорогу сестру збираю у дорогу, а брати вирішили не брати машину." # homographs | ||||||
|  | ] | ||||||
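A runnable version of the docstring usage above, under the same assumption that the blank Ukrainian pipeline from `spacy/lang/uk/__init__.py` can be constructed:

```python
# Sketch only: pipe the bundled example sentences through the blank pipeline.
from spacy.lang.uk import Ukrainian
from spacy.lang.uk.examples import sentences

nlp = Ukrainian()
for doc in nlp.pipe(sentences):
    print(len(doc), doc[:3])  # token count and the first few tokens of each sentence
```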
							
								
								
									
spacy/lang/uk/lemmatizer.py (new file, 12 lines)
							|  | @ -0,0 +1,12 @@ | ||||||
|  | from ..ru.lemmatizer import RussianLemmatizer | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class UkrainianLemmatizer(RussianLemmatizer): | ||||||
|  | 
 | ||||||
|  |     def __init__(self, pymorphy2_lang='ru'): | ||||||
|  |         try: | ||||||
|  |             super(UkrainianLemmatizer, self).__init__(pymorphy2_lang='uk') | ||||||
|  |         except ImportError: | ||||||
|  |             raise ImportError( | ||||||
|  |                 'The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: ' | ||||||
|  |                 'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"') | ||||||
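Since the subclass simply re-targets pymorphy2 at its Ukrainian dictionaries, a direct call looks the same as for the Russian lemmatizer. A sketch, assuming `pymorphy2-dicts-uk` is installed (the lemma in the comment is illustrative):

```python
# Sketch only: UkrainianLemmatizer delegates to MorphAnalyzer(lang='uk').
from spacy.lang.uk.lemmatizer import UkrainianLemmatizer

lemmatizer = UkrainianLemmatizer()
print(lemmatizer("книги", "NOUN"))  # e.g. ['книга']
```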
							
								
								
									
spacy/lang/uk/lex_attrs.py (new file, 42 lines)
							|  | @ -0,0 +1,42 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | # import the symbols for the attrs you want to overwrite | ||||||
|  | from ...attrs import LIKE_NUM | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Overwriting functions for lexical attributes | ||||||
|  | # Documentation: https://spacy.io/docs/usage/adding-languages#lex-attrs | ||||||
|  | # Most of these functions, like is_lower or like_url, should be language- | ||||||
|  | # independent. Others, like like_num (which includes both digits and number | ||||||
|  | # words), require customisation. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Example: check if token resembles a number | ||||||
|  | _num_words = ["більйон", "вісім", "вісімдесят", "вісімнадцять", "вісімсот", "восьмий", "два", "двадцять", "дванадцять", | ||||||
|  |               "двісті", "дев'яносто", "дев'ятнадцять", "дев'ятсот", "дев'ять", "десять", "децильйон", "квадрильйон", | ||||||
|  |               "квінтильйон", "мільйон", "мільярд", "нонильйон", "один", "одинадцять", "октильйон", "п'ятий", | ||||||
|  |               "п'ятисотий", "п'ятнадцять", "п'ятсот", "п'ять", "секстильйон", "септильйон", "сім", "сімдесят", | ||||||
|  |               "сімнадцять", "сімсот", "сорок", "сто", "тисяча", "три", "тридцять", "трильйон", "тринадцять", "триста", | ||||||
|  |               "чотири", "чотириста", "чотирнадцять", "шістдесят", "шістнадцять", "шістсот", "шість"] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def like_num(text): | ||||||
|  |     text = text.replace(',', '').replace('.', '') | ||||||
|  |     if text.isdigit(): | ||||||
|  |         return True | ||||||
|  |     if text.count('/') == 1: | ||||||
|  |         num, denom = text.split('/') | ||||||
|  |         if num.isdigit() and denom.isdigit(): | ||||||
|  |             return True | ||||||
|  |     if text in _num_words: | ||||||
|  |         return True | ||||||
|  |     return False | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Create dictionary of functions to overwrite. The default lex_attr_getters are | ||||||
|  | # updated with this one, so only the functions defined here are overwritten. | ||||||
|  | 
 | ||||||
|  | LEX_ATTRS = { | ||||||
|  |     LIKE_NUM: like_num | ||||||
|  | } | ||||||
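The `like_num` function above accepts plain digits, simple fractions, and the number words listed in `_num_words`, which a few direct checks make concrete:

```python
# Sketch only: behaviour follows directly from the like_num definition above.
from spacy.lang.uk.lex_attrs import like_num

assert like_num("10")        # digits
assert like_num("2/3")       # numerator and denominator are both digits
assert like_num("п'ять")     # number word from _num_words
assert not like_num("кава")  # ordinary content word
```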
							
								
								
									
spacy/lang/uk/stop_words.py (new file, 404 lines)
							|  | @ -0,0 +1,404 @@ | ||||||
|  | # encoding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Add stop words | ||||||
|  | # Documentation: https://spacy.io/docs/usage/adding-languages#stop-words | ||||||
|  | # To improve readability, words should be ordered alphabetically and separated | ||||||
|  | # by spaces and newlines. When adding stop words from an online source, always | ||||||
|  | # include the link in a comment. Make sure to proofread and double-check the | ||||||
|  | # words – lists available online are often known to contain mistakes. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | STOP_WORDS = set("""а | ||||||
|  | або | ||||||
|  | адже | ||||||
|  | але | ||||||
|  | алло | ||||||
|  | багато | ||||||
|  | без | ||||||
|  | безперервно | ||||||
|  | би | ||||||
|  | більш | ||||||
|  | більше | ||||||
|  | біля | ||||||
|  | близько | ||||||
|  | був | ||||||
|  | буває | ||||||
|  | буде | ||||||
|  | будемо | ||||||
|  | будете | ||||||
|  | будеш | ||||||
|  | буду | ||||||
|  | будуть | ||||||
|  | будь | ||||||
|  | була | ||||||
|  | були | ||||||
|  | було | ||||||
|  | бути | ||||||
|  | бывь | ||||||
|  | в | ||||||
|  | важлива | ||||||
|  | важливе | ||||||
|  | важливий | ||||||
|  | важливі | ||||||
|  | вам | ||||||
|  | вами | ||||||
|  | вас | ||||||
|  | ваш | ||||||
|  | ваша | ||||||
|  | ваше | ||||||
|  | ваші | ||||||
|  | вгорі | ||||||
|  | вгору | ||||||
|  | вдалині | ||||||
|  | вже | ||||||
|  | ви | ||||||
|  | від | ||||||
|  | відсотків | ||||||
|  | він | ||||||
|  | вісім | ||||||
|  | вісімнадцятий | ||||||
|  | вісімнадцять | ||||||
|  | вниз | ||||||
|  | внизу | ||||||
|  | вона | ||||||
|  | вони | ||||||
|  | воно | ||||||
|  | восьмий | ||||||
|  | всього | ||||||
|  | втім | ||||||
|  | г | ||||||
|  | геть | ||||||
|  | говорив | ||||||
|  | говорить | ||||||
|  | давно | ||||||
|  | далеко | ||||||
|  | далі | ||||||
|  | дарма | ||||||
|  | два | ||||||
|  | двадцятий | ||||||
|  | двадцять | ||||||
|  | дванадцятий | ||||||
|  | дванадцять | ||||||
|  | дві | ||||||
|  | двох | ||||||
|  | де | ||||||
|  | дев'ятий | ||||||
|  | дев'ятнадцятий | ||||||
|  | дев'ятнадцять | ||||||
|  | дев'ять | ||||||
|  | декілька | ||||||
|  | день | ||||||
|  | десятий | ||||||
|  | десять | ||||||
|  | дійсно | ||||||
|  | для | ||||||
|  | дня | ||||||
|  | до | ||||||
|  | добре | ||||||
|  | довго | ||||||
|  | доки | ||||||
|  | досить | ||||||
|  | другий | ||||||
|  | дуже | ||||||
|  | же | ||||||
|  | життя | ||||||
|  | з | ||||||
|  | за | ||||||
|  | завжди | ||||||
|  | зазвичай | ||||||
|  | зайнята | ||||||
|  | зайнятий | ||||||
|  | зайняті | ||||||
|  | зайнято | ||||||
|  | занадто | ||||||
|  | зараз | ||||||
|  | зате | ||||||
|  | звичайно | ||||||
|  | звідси | ||||||
|  | звідусіль | ||||||
|  | здається | ||||||
|  | значить | ||||||
|  | знову | ||||||
|  | зовсім | ||||||
|  | ім'я | ||||||
|  | іноді | ||||||
|  | інша | ||||||
|  | інше | ||||||
|  | інший | ||||||
|  | інших | ||||||
|  | інші | ||||||
|  | її | ||||||
|  | їй | ||||||
|  | їх | ||||||
|  | його | ||||||
|  | йому | ||||||
|  | ким | ||||||
|  | кого | ||||||
|  | кожен | ||||||
|  | кожна | ||||||
|  | кожне | ||||||
|  | кожні | ||||||
|  | коли | ||||||
|  | кому | ||||||
|  | краще | ||||||
|  | крейдуючи | ||||||
|  | кругом | ||||||
|  | куди | ||||||
|  | ласка | ||||||
|  | лише | ||||||
|  | люди | ||||||
|  | людина | ||||||
|  | майже | ||||||
|  | мало | ||||||
|  | мати | ||||||
|  | мене | ||||||
|  | мені | ||||||
|  | менш | ||||||
|  | менше | ||||||
|  | ми | ||||||
|  | мимо | ||||||
|  | міг | ||||||
|  | між | ||||||
|  | мій | ||||||
|  | мільйонів | ||||||
|  | мною | ||||||
|  | могти | ||||||
|  | моє | ||||||
|  | мож | ||||||
|  | може | ||||||
|  | можна | ||||||
|  | можно | ||||||
|  | можуть | ||||||
|  | можхо | ||||||
|  | мої | ||||||
|  | мор | ||||||
|  | моя | ||||||
|  | на | ||||||
|  | навіть | ||||||
|  | навіщо | ||||||
|  | навкруги | ||||||
|  | нагорі | ||||||
|  | над | ||||||
|  | назад | ||||||
|  | найбільш | ||||||
|  | нам | ||||||
|  | нами | ||||||
|  | нарешті | ||||||
|  | нас | ||||||
|  | наш | ||||||
|  | наша | ||||||
|  | наше | ||||||
|  | наші | ||||||
|  | не | ||||||
|  | небагато | ||||||
|  | недалеко | ||||||
|  | немає | ||||||
|  | нерідко | ||||||
|  | нещодавно | ||||||
|  | нею | ||||||
|  | нибудь | ||||||
|  | нижче | ||||||
|  | низько | ||||||
|  | ним | ||||||
|  | ними | ||||||
|  | них | ||||||
|  | ні | ||||||
|  | ніби | ||||||
|  | ніколи | ||||||
|  | нікуди | ||||||
|  | нічого | ||||||
|  | ну | ||||||
|  | нх | ||||||
|  | нього | ||||||
|  | о | ||||||
|  | обоє | ||||||
|  | один | ||||||
|  | одинадцятий | ||||||
|  | одинадцять | ||||||
|  | однієї | ||||||
|  | одній | ||||||
|  | одного | ||||||
|  | означає | ||||||
|  | окрім | ||||||
|  | он | ||||||
|  | особливо | ||||||
|  | ось | ||||||
|  | перед | ||||||
|  | перший | ||||||
|  | під | ||||||
|  | пізніше | ||||||
|  | пір | ||||||
|  | по | ||||||
|  | повинно | ||||||
|  | подів | ||||||
|  | поки | ||||||
|  | пора | ||||||
|  | поруч | ||||||
|  | посеред | ||||||
|  | потім | ||||||
|  | потрібно | ||||||
|  | почала | ||||||
|  | прекрасне | ||||||
|  | прекрасно | ||||||
|  | при | ||||||
|  | про | ||||||
|  | просто | ||||||
|  | проте | ||||||
|  | проти | ||||||
|  | п'ятий | ||||||
|  | п'ятнадцятий | ||||||
|  | п'ятнадцять | ||||||
|  | п'ять | ||||||
|  | раз | ||||||
|  | раніше | ||||||
|  | рано | ||||||
|  | раптом | ||||||
|  | рік | ||||||
|  | роки | ||||||
|  | років | ||||||
|  | року | ||||||
|  | сам | ||||||
|  | сама | ||||||
|  | саме | ||||||
|  | самим | ||||||
|  | самими | ||||||
|  | самих | ||||||
|  | самі | ||||||
|  | самій | ||||||
|  | само | ||||||
|  | самого | ||||||
|  | самому | ||||||
|  | саму | ||||||
|  | світу | ||||||
|  | свого | ||||||
|  | своє | ||||||
|  | свої | ||||||
|  | своїй | ||||||
|  | своїх | ||||||
|  | свою | ||||||
|  | сеаой | ||||||
|  | себе | ||||||
|  | сім | ||||||
|  | сімнадцятий | ||||||
|  | сімнадцять | ||||||
|  | сказав | ||||||
|  | сказала | ||||||
|  | сказати | ||||||
|  | скільки | ||||||
|  | скрізь | ||||||
|  | собі | ||||||
|  | собою | ||||||
|  | спасибі | ||||||
|  | спочатку | ||||||
|  | справ | ||||||
|  | став | ||||||
|  | суть | ||||||
|  | сьогодні | ||||||
|  | сьомий | ||||||
|  | т | ||||||
|  | та | ||||||
|  | так | ||||||
|  | така | ||||||
|  | таке | ||||||
|  | такий | ||||||
|  | такі | ||||||
|  | також | ||||||
|  | там | ||||||
|  | твій | ||||||
|  | твоє | ||||||
|  | твоя | ||||||
|  | те | ||||||
|  | тебе | ||||||
|  | теж | ||||||
|  | тепер | ||||||
|  | ти | ||||||
|  | тим | ||||||
|  | тими | ||||||
|  | тисяч | ||||||
|  | тих | ||||||
|  | ті | ||||||
|  | тією | ||||||
|  | тільки | ||||||
|  | тобі | ||||||
|  | тобою | ||||||
|  | того | ||||||
|  | тоді | ||||||
|  | той | ||||||
|  | том | ||||||
|  | тому | ||||||
|  | треба | ||||||
|  | третій | ||||||
|  | три | ||||||
|  | тринадцятий | ||||||
|  | тринадцять | ||||||
|  | трохи | ||||||
|  | ту | ||||||
|  | туди | ||||||
|  | тут | ||||||
|  | у | ||||||
|  | увесь | ||||||
|  | уміти | ||||||
|  | усе | ||||||
|  | усі | ||||||
|  | усім | ||||||
|  | усіма | ||||||
|  | усіх | ||||||
|  | усію | ||||||
|  | усього | ||||||
|  | усьому | ||||||
|  | усю | ||||||
|  | усюди | ||||||
|  | уся | ||||||
|  | хіба | ||||||
|  | хотіти | ||||||
|  | хоч | ||||||
|  | хоча | ||||||
|  | хочеш | ||||||
|  | хто | ||||||
|  | це | ||||||
|  | цей | ||||||
|  | цим | ||||||
|  | цими | ||||||
|  | цих | ||||||
|  | ці | ||||||
|  | цій | ||||||
|  | цього | ||||||
|  | цьому | ||||||
|  | цю | ||||||
|  | ця | ||||||
|  | час | ||||||
|  | частіше | ||||||
|  | часто | ||||||
|  | часу | ||||||
|  | через | ||||||
|  | четвертий | ||||||
|  | чи | ||||||
|  | чим | ||||||
|  | численна | ||||||
|  | численне | ||||||
|  | численний | ||||||
|  | численні | ||||||
|  | чого | ||||||
|  | чому | ||||||
|  | чотири | ||||||
|  | чотирнадцятий | ||||||
|  | чотирнадцять | ||||||
|  | шістнадцятий | ||||||
|  | шістнадцять | ||||||
|  | шість | ||||||
|  | шостий | ||||||
|  | ще | ||||||
|  | що | ||||||
|  | щоб | ||||||
|  | я | ||||||
|  | як | ||||||
|  | яка | ||||||
|  | який | ||||||
|  | яких | ||||||
|  | які | ||||||
|  | якій | ||||||
|  | якого | ||||||
|  | якщо | ||||||
|  | """.split()) | ||||||
							
								
								
									
spacy/lang/uk/tag_map.py (new file, 36 lines)
							|  | @ -0,0 +1,36 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ | ||||||
|  | from ...symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Add a tag map | ||||||
|  | # Documentation: https://spacy.io/docs/usage/adding-languages#tag-map | ||||||
|  | # Universal Dependencies: http://universaldependencies.org/u/pos/all.html | ||||||
|  | # The keys of the tag map should be strings in your tag set. The dictionary must | ||||||
|  | # have an entry POS whose value is one of the Universal Dependencies tags. | ||||||
|  | # Optionally, you can also include morphological features or other attributes. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | TAG_MAP = { | ||||||
|  |     "ADV":      {POS: ADV}, | ||||||
|  |     "NOUN":     {POS: NOUN}, | ||||||
|  |     "ADP":      {POS: ADP}, | ||||||
|  |     "PRON":     {POS: PRON}, | ||||||
|  |     "SCONJ":    {POS: SCONJ}, | ||||||
|  |     "PROPN":    {POS: PROPN}, | ||||||
|  |     "DET":      {POS: DET}, | ||||||
|  |     "SYM":      {POS: SYM}, | ||||||
|  |     "INTJ":     {POS: INTJ}, | ||||||
|  |     "PUNCT":    {POS: PUNCT}, | ||||||
|  |     "NUM":      {POS: NUM}, | ||||||
|  |     "AUX":      {POS: AUX}, | ||||||
|  |     "X":        {POS: X}, | ||||||
|  |     "CONJ":     {POS: CONJ}, | ||||||
|  |     "CCONJ":    {POS: CCONJ}, | ||||||
|  |     "ADJ":      {POS: ADJ}, | ||||||
|  |     "VERB":     {POS: VERB}, | ||||||
|  |     "PART":     {POS: PART}, | ||||||
|  |     "SP":     	{POS: SPACE} | ||||||
|  | } | ||||||
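Each entry maps a tag-set string to the Universal Dependencies POS symbol that spaCy stores on the token, so a minimal sanity check is just a dictionary lookup:

```python
# Sketch only: look up two entries of the tag map defined above.
from spacy.symbols import POS, NOUN, SPACE
from spacy.lang.uk.tag_map import TAG_MAP

assert TAG_MAP["NOUN"][POS] == NOUN
assert TAG_MAP["SP"][POS] == SPACE
```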
							
								
								
									
spacy/lang/uk/tokenizer_exceptions.py (new file, 38 lines)
							|  | @ -0,0 +1,38 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | # import symbols – if you need to use more, add them here | ||||||
|  | from ...symbols import ORTH, LEMMA, POS, NORM, NOUN | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Add tokenizer exceptions | ||||||
|  | # Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions | ||||||
|  | # Feel free to use custom logic to generate repetitive exceptions more efficiently. | ||||||
|  | # If an exception is split into more than one token, the ORTH values combined always | ||||||
|  | # need to match the original string. | ||||||
|  | 
 | ||||||
|  | # Exceptions should be added in the following format: | ||||||
|  | 
 | ||||||
|  | _exc = {} | ||||||
|  | 
 | ||||||
|  | for exc_data in [ | ||||||
|  |     {ORTH: "вул.", LEMMA: "вулиця", NORM: "вулиця", POS: NOUN}, | ||||||
|  |     {ORTH: "ім.", LEMMA: "ім'я", NORM: "імені", POS: NOUN}, | ||||||
|  |     {ORTH: "просп.", LEMMA: "проспект", NORM: "проспект", POS: NOUN}, | ||||||
|  |     {ORTH: "бул.", LEMMA: "бульвар", NORM: "бульвар", POS: NOUN}, | ||||||
|  |     {ORTH: "пров.", LEMMA: "провулок", NORM: "провулок", POS: NOUN}, | ||||||
|  |     {ORTH: "пл.", LEMMA: "площа", NORM: "площа", POS: NOUN}, | ||||||
|  |     {ORTH: "г.", LEMMA: "гора", NORM: "гора", POS: NOUN}, | ||||||
|  |     {ORTH: "п.", LEMMA: "пан", NORM: "пан", POS: NOUN}, | ||||||
|  |     {ORTH: "м.", LEMMA: "місто", NORM: "місто", POS: NOUN}, | ||||||
|  |     {ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN}, | ||||||
|  |     {ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN}, | ||||||
|  |     {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN}, | ||||||
|  |     {ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}]: | ||||||
|  |     _exc[exc_data[ORTH]] = [exc_data] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # To keep things clean and readable, it's recommended to only declare the | ||||||
|  | # TOKENIZER_EXCEPTIONS at the bottom: | ||||||
|  | 
 | ||||||
|  | TOKENIZER_EXCEPTIONS = _exc | ||||||
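A hedged check that an abbreviation exception survives tokenization as a single token with its NORM attached; the surname is a made-up placeholder, and building the pipeline again assumes pymorphy2 with Ukrainian dictionaries (the same pattern is tested properly in spacy/tests/lang/uk/test_tokenizer_exc.py below):

```python
# Sketch only: "проф." should stay one token and normalise to "професор".
from spacy.lang.uk import Ukrainian

nlp = Ukrainian()
doc = nlp("проф. Петренко")  # hypothetical name, for illustration only
assert doc[0].text == "проф."
assert doc[0].norm_ == "професор"
```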
spacy/tests/conftest.py

|  | @ -117,6 +117,12 @@ def tr_tokenizer(): | ||||||
|     return get_lang_class("tr").Defaults.create_tokenizer() |     return get_lang_class("tr").Defaults.create_tokenizer() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.fixture(scope="session") | ||||||
|  | def uk_tokenizer(): | ||||||
|  |     pymorphy = pytest.importorskip("pymorphy2") | ||||||
|  |     return util.get_lang_class("uk").Defaults.create_tokenizer() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| @pytest.fixture(scope="session") | @pytest.fixture(scope="session") | ||||||
| def ca_tokenizer(): | def ca_tokenizer(): | ||||||
|     return get_lang_class("ca").Defaults.create_tokenizer() |     return get_lang_class("ca").Defaults.create_tokenizer() | ||||||
|  |  | ||||||
							
								
								
									
spacy/tests/lang/uk/__init__.py (new empty file)
							
								
								
									
spacy/tests/lang/uk/test_tokenizer.py (new file, 128 lines)
							|  | @ -0,0 +1,128 @@ | ||||||
|  | # coding: utf-8 | ||||||
|  | """Test that open, closed and paired punctuation is split off correctly.""" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | PUNCT_OPEN = ['(', '[', '{', '*'] | ||||||
|  | PUNCT_CLOSE = [')', ']', '}', '*'] | ||||||
|  | PUNCT_PAIRED = [('(', ')'),  ('[', ']'), ('{', '}'), ('*', '*')] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('text', ["(", "((", "<"]) | ||||||
|  | def test_uk_tokenizer_handles_only_punct(uk_tokenizer, text): | ||||||
|  |     tokens = uk_tokenizer(text) | ||||||
|  |     assert len(tokens) == len(text) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('punct', PUNCT_OPEN) | ||||||
|  | @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||||
|  | def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text): | ||||||
|  |     tokens = uk_tokenizer(punct + text) | ||||||
|  |     assert len(tokens) == 2 | ||||||
|  |     assert tokens[0].text == punct | ||||||
|  |     assert tokens[1].text == text | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('punct', PUNCT_CLOSE) | ||||||
|  | @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||||
|  | def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text): | ||||||
|  |     tokens = uk_tokenizer(text + punct) | ||||||
|  |     assert len(tokens) == 2 | ||||||
|  |     assert tokens[0].text == text | ||||||
|  |     assert tokens[1].text == punct | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('punct', PUNCT_OPEN) | ||||||
|  | @pytest.mark.parametrize('punct_add', ["`"]) | ||||||
|  | @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||||
|  | def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, text): | ||||||
|  |     tokens = uk_tokenizer(punct + punct_add + text) | ||||||
|  |     assert len(tokens) == 3 | ||||||
|  |     assert tokens[0].text == punct | ||||||
|  |     assert tokens[1].text == punct_add | ||||||
|  |     assert tokens[2].text == text | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('punct', PUNCT_CLOSE) | ||||||
|  | @pytest.mark.parametrize('punct_add', ["'"]) | ||||||
|  | @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||||
|  | def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add, text): | ||||||
|  |     tokens = uk_tokenizer(text + punct + punct_add) | ||||||
|  |     assert len(tokens) == 3 | ||||||
|  |     assert tokens[0].text == text | ||||||
|  |     assert tokens[1].text == punct | ||||||
|  |     assert tokens[2].text == punct_add | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('punct', PUNCT_OPEN) | ||||||
|  | @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||||
|  | def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text): | ||||||
|  |     tokens = uk_tokenizer(punct + punct + punct + text) | ||||||
|  |     assert len(tokens) == 4 | ||||||
|  |     assert tokens[0].text == punct | ||||||
|  |     assert tokens[3].text == text | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('punct', PUNCT_CLOSE) | ||||||
|  | @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||||
|  | def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text): | ||||||
|  |     tokens = uk_tokenizer(text + punct + punct + punct) | ||||||
|  |     assert len(tokens) == 4 | ||||||
|  |     assert tokens[0].text == text | ||||||
|  |     assert tokens[1].text == punct | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('text', ["'Тест"]) | ||||||
|  | def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text): | ||||||
|  |     tokens = uk_tokenizer(text) | ||||||
|  |     assert len(tokens) == 2 | ||||||
|  |     assert tokens[0].text == "'" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('text', ["Тест''"]) | ||||||
|  | def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text): | ||||||
|  |     tokens = uk_tokenizer(text) | ||||||
|  |     assert len(tokens) == 2 | ||||||
|  |     tokens_punct = uk_tokenizer("''") | ||||||
|  |     assert len(tokens_punct) == 1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) | ||||||
|  | @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||||
|  | def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open, | ||||||
|  |                                               punct_close, text): | ||||||
|  |     tokens = uk_tokenizer(punct_open + text + punct_close) | ||||||
|  |     assert len(tokens) == 3 | ||||||
|  |     assert tokens[0].text == punct_open | ||||||
|  |     assert tokens[1].text == text | ||||||
|  |     assert tokens[2].text == punct_close | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) | ||||||
|  | @pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")]) | ||||||
|  | @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | ||||||
|  | def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close, | ||||||
|  |                                      punct_open2, punct_close2, text): | ||||||
|  |     tokens = uk_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) | ||||||
|  |     assert len(tokens) == 5 | ||||||
|  |     assert tokens[0].text == punct_open2 | ||||||
|  |     assert tokens[1].text == punct_open | ||||||
|  |     assert tokens[2].text == text | ||||||
|  |     assert tokens[3].text == punct_close | ||||||
|  |     assert tokens[4].text == punct_close2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('text', ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."]) | ||||||
|  | def test_uk_tokenizer_splits_trailing_dot(uk_tokenizer, text): | ||||||
|  |     tokens = uk_tokenizer(text) | ||||||
|  |     assert tokens[1].text == "." | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_uk_tokenizer_splits_bracket_period(uk_tokenizer): | ||||||
|  |     text = "(Раз, два, три, проверка)." | ||||||
|  |     tokens = uk_tokenizer(text) | ||||||
|  |     assert tokens[len(tokens) - 1].text == "." | ||||||
							
								
								
									
spacy/tests/lang/uk/test_tokenizer_exc.py (new file, 18 lines)
							|  | @ -0,0 +1,18 @@ | ||||||
|  | # coding: utf-8 | ||||||
|  | """Test that tokenizer exceptions are parsed correctly.""" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('text,norms,lemmas', [("ім.", ["імені"], ["ім'я"]), | ||||||
|  |                                         ("проф.", ["професор"], ["професор"])]) | ||||||
|  | def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas): | ||||||
|  |     tokens = uk_tokenizer(text) | ||||||
|  |     assert len(tokens) == 1 | ||||||
|  |     assert [token.norm_ for token in tokens] == norms | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||