mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						c86445bdfd
					
				|  | @ -73,10 +73,10 @@ def generate_sentence(sent): | |||
|     tokens = [] | ||||
|     for i, id in enumerate(id_): | ||||
|         token = {} | ||||
|         token["orth"] = word[id] | ||||
|         token["tag"] = tag[id] | ||||
|         token["head"] = head[id] - i | ||||
|         token["dep"] = dep[id] | ||||
|         token["orth"] = word[i] | ||||
|         token["tag"] = tag[i] | ||||
|         token["head"] = head[i] - id | ||||
|         token["dep"] = dep[i] | ||||
|         tokens.append(token) | ||||
|     sentence["tokens"] = tokens | ||||
|     return sentence | ||||
|  |  | |||
|  | @ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH | |||
| from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lemmatizer import LOOKUP | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
|  | @ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults): | |||
|     infixes = tuple(TOKENIZER_INFIXES) | ||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) | ||||
|     token_match = TOKEN_MATCH | ||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|  |  | |||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										42
									
								
								spacy/lang/fr/syntax_iterators.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								spacy/lang/fr/syntax_iterators.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,42 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...symbols import NOUN, PROPN, PRON | ||||
| 
 | ||||
| 
 | ||||
def noun_chunks(obj):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.

    Yields `(start, end, label)` tuples where `start` is `left_edge.i` of the
    chunk's head word, `end` is `right_edge.i + 1` (exclusive) and `label` is
    the string-store value for 'NP'.
    """
    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    # Only used for membership tests, so a set is sufficient.
    np_deps = frozenset(strings[label] for label in labels)
    conj = strings.add('conj')
    np_label = strings.add('NP')
    seen = set()
    for word in obj:
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.i in seen:
            continue
        if word.dep in np_deps:
            is_np = True
        elif word.dep == conj:
            # Climb the chain of conjuncts for as long as each head is itself
            # a conjunct attached to an earlier token.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            is_np = head.dep in np_deps
        else:
            is_np = False
        if not is_np:
            continue
        # Skip the word if any token of its subtree is already in a chunk.
        if any(w.i in seen for w in word.subtree):
            continue
        start = word.left_edge.i
        end = word.right_edge.i + 1
        seen.update(range(start, end))
        yield start, end, np_label
| 
 | ||||
| 
 | ||||
| SYNTAX_ITERATORS = { | ||||
|     'noun_chunks': noun_chunks | ||||
| } | ||||
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .stop_words import STOP_WORDS | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
|  | @ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults): | |||
|     lex_attr_getters[LANG] = lambda text: 'pl' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										23
									
								
								spacy/lang/pl/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								spacy/lang/pl/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,23 @@ | |||
# encoding: utf8
from __future__ import unicode_literals

# This module lives in spacy/lang/pl/, so the top-level `symbols` module is
# three levels up. ADV, ADJ and NOUN are needed by the POS values below.
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN


# Maps an exception string to the list of token attribute dicts it is
# tokenized into.
_exc = {}

# Abbreviations kept as a single token, with an explicit lemma and POS.
for exc_data in [
    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
    # The value must be a list of token dicts; a trailing comma here would
    # wrap the list in a 1-tuple.
    _exc[exc_data[ORTH]] = [dict(exc_data)]

# Abbreviations kept as a single token with no extra attributes.
for orth in [
    "w.", "r."]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = dict(_exc)
|  | @ -15,6 +15,7 @@ class Chinese(Language): | |||
|             raise ImportError("The Chinese tokenizer requires the Jieba library: " | ||||
|                               "https://github.com/fxsjy/jieba") | ||||
|         words = list(jieba.cut(text, cut_all=True)) | ||||
|         words=[x for x in words if x] | ||||
|         return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -110,5 +110,35 @@ def es_noun_chunks(obj): | |||
|         token = next_token(token) | ||||
| 
 | ||||
| 
 | ||||
def french_noun_chunks(obj):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.

    Yields `(start, end, label)` tuples where `start` is `left_edge.i` of the
    chunk's head word, `end` is `right_edge.i + 1` (exclusive) and `label` is
    the string-store value for 'NP'.
    """
    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    # Only used for membership tests, so a set is sufficient.
    np_deps = frozenset(strings[label] for label in labels)
    conj = strings.add('conj')
    np_label = strings.add('NP')
    seen = set()
    for word in obj:
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.i in seen:
            continue
        if word.dep in np_deps:
            is_np = True
        elif word.dep == conj:
            # Climb the chain of conjuncts for as long as each head is itself
            # a conjunct attached to an earlier token.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            is_np = head.dep in np_deps
        else:
            is_np = False
        if not is_np:
            continue
        # Skip the word if any token of its subtree is already in a chunk.
        if any(w.i in seen for w in word.subtree):
            continue
        start = word.left_edge.i
        end = word.right_edge.i + 1
        seen.update(range(start, end))
        yield start, end, np_label
| 
 | ||||
| 
 | ||||
| CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, | ||||
|             'es': es_noun_chunks} | ||||
|             'es': es_noun_chunks, 'fr': french_noun_chunks} | ||||
|  |  | |||
|  | @ -18,7 +18,7 @@ p | |||
| 
 | ||||
|     # Construction 2 | ||||
|     from spacy.tokens import Doc | ||||
|     doc = doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], | ||||
|     doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], | ||||
|                                spaces=[True, False, False]) | ||||
| 
 | ||||
| +h(2, "init") Doc.__init__ | ||||
|  |  | |||
|  | @ -18,7 +18,7 @@ p | |||
|             +cell=cell | ||||
| 
 | ||||
| p | ||||
|     |  Fist, the raw text is split on whitespace characters, similar to | ||||
|     |  First, the raw text is split on whitespace characters, similar to | ||||
|     |  #[code text.split(' ')]. Then, the tokenizer processes the text from | ||||
|     |  left to right. On each substring, it performs two checks: | ||||
| 
 | ||||
|  |  | |||
|  | @ -181,7 +181,7 @@ p | |||
|     from spacy.vocab import Vocab | ||||
| 
 | ||||
|     nlp = spacy.load('en') | ||||
|     moby_dick = open('moby_dick.txt', 'r') | ||||
|     moby_dick = open('moby_dick.txt', 'r').read() | ||||
|     doc = nlp(moby_dick) | ||||
|     doc.to_disk('/moby_dick.bin') | ||||
| 
 | ||||
|  |  | |||
|  | @ -4,7 +4,7 @@ include ../../_includes/_mixins | |||
| 
 | ||||
| p | ||||
|     |  As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy] | ||||
|     |  and #[+a(DEMOS_URL + "displacy-ent") displaCy #[sup ENT]] are finally an | ||||
|     |  and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]] are finally an | ||||
|     |  official part of the library. Visualizing a dependency parse or named | ||||
|     |  entities in a text is not only a fun NLP demo – it can also be incredibly | ||||
|     |  helpful in speeding up development and debugging your code and training | ||||
|  |  | |||
|  | @ -77,7 +77,7 @@ p | |||
| 
 | ||||
| +code. | ||||
|     doc1 = nlp(u"Paris is the largest city in France.") | ||||
|     doc2 = nlp(u"Ljubljana is the capital of Lithuania.") | ||||
|     doc2 = nlp(u"Vilnius is the capital of Lithuania.") | ||||
|     doc3 = nlp(u"An emu is a large bird.") | ||||
| 
 | ||||
|     for doc in [doc1, doc2, doc3]: | ||||
|  | @ -85,13 +85,13 @@ p | |||
|             print(doc.similarity(other_doc)) | ||||
| 
 | ||||
| p | ||||
|     |  Even though the sentences about Paris and Ljubljana consist of different | ||||
|     |  Even though the sentences about Paris and Vilnius consist of different | ||||
|     |  words and entities, they both describe the same concept and are seen as | ||||
|     |  more similar than the sentence about emus. In this case, even a misspelled | ||||
|     |  version of "Ljubljana" would still produce very similar results. | ||||
|     |  version of "Vilnius" would still produce very similar results. | ||||
| 
 | ||||
| +table | ||||
|     - var examples = {"Paris is the largest city in France.": [1, 0.84, 0.65], "Ljubljana is the capital of Lithuania.": [0.84, 1, 0.52], "An emu is a large bird.": [0.65, 0.52, 1]} | ||||
|     - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]} | ||||
|     - var counter = 0 | ||||
| 
 | ||||
|     +row | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user