mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						c86445bdfd
					
				|  | @ -73,10 +73,10 @@ def generate_sentence(sent): | ||||||
|     tokens = [] |     tokens = [] | ||||||
|     for i, id in enumerate(id_): |     for i, id in enumerate(id_): | ||||||
|         token = {} |         token = {} | ||||||
|         token["orth"] = word[id] |         token["orth"] = word[i] | ||||||
|         token["tag"] = tag[id] |         token["tag"] = tag[i] | ||||||
|         token["head"] = head[id] - i |         token["head"] = head[i] - id | ||||||
|         token["dep"] = dep[id] |         token["dep"] = dep[i] | ||||||
|         tokens.append(token) |         tokens.append(token) | ||||||
|     sentence["tokens"] = tokens |     sentence["tokens"] = tokens | ||||||
|     return sentence |     return sentence | ||||||
|  |  | ||||||
|  | @ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH | ||||||
| from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
| from .lemmatizer import LOOKUP | from .lemmatizer import LOOKUP | ||||||
|  | from .syntax_iterators import SYNTAX_ITERATORS | ||||||
| 
 | 
 | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
|  | @ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults): | ||||||
|     infixes = tuple(TOKENIZER_INFIXES) |     infixes = tuple(TOKENIZER_INFIXES) | ||||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) |     suffixes = tuple(TOKENIZER_SUFFIXES) | ||||||
|     token_match = TOKEN_MATCH |     token_match = TOKEN_MATCH | ||||||
|  |     syntax_iterators = dict(SYNTAX_ITERATORS) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def create_lemmatizer(cls, nlp=None): |     def create_lemmatizer(cls, nlp=None): | ||||||
|  |  | ||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										42
									
								
								spacy/lang/fr/syntax_iterators.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								spacy/lang/fr/syntax_iterators.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,42 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...symbols import NOUN, PROPN, PRON | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def noun_chunks(obj): | ||||||
|  |     """ | ||||||
|  |     Detect base noun phrases from a dependency parse. Works on both Doc and Span. | ||||||
|  |     """ | ||||||
|  |     labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss'] | ||||||
|  |     doc = obj.doc  # Ensure works on both Doc and Span. | ||||||
|  |     np_deps = [doc.vocab.strings[label] for label in labels] | ||||||
|  |     conj = doc.vocab.strings.add('conj') | ||||||
|  |     np_label = doc.vocab.strings.add('NP') | ||||||
|  |     seen = set() | ||||||
|  |     for i, word in enumerate(obj): | ||||||
|  |         if word.pos not in (NOUN, PROPN, PRON): | ||||||
|  |             continue | ||||||
|  |         # Prevent nested chunks from being produced | ||||||
|  |         if word.i in seen: | ||||||
|  |             continue | ||||||
|  |         if word.dep in np_deps: | ||||||
|  |             if any(w.i in seen for w in word.subtree): | ||||||
|  |                 continue | ||||||
|  |             seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1)) | ||||||
|  |             yield word.left_edge.i, word.right_edge.i+1, np_label | ||||||
|  |         elif word.dep == conj: | ||||||
|  |             head = word.head | ||||||
|  |             while head.dep == conj and head.head.i < head.i: | ||||||
|  |                 head = head.head | ||||||
|  |             # If the head is an NP, and we're coordinated to it, we're an NP | ||||||
|  |             if head.dep in np_deps: | ||||||
|  |                 if any(w.i in seen for w in word.subtree): | ||||||
|  |                     continue | ||||||
|  |                 seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1)) | ||||||
|  |                 yield word.left_edge.i, word.right_edge.i+1, np_label | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | SYNTAX_ITERATORS = { | ||||||
|  |     'noun_chunks': noun_chunks | ||||||
|  | } | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
|  | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
| 
 | 
 | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
|  | @ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults): | ||||||
|     lex_attr_getters[LANG] = lambda text: 'pl' |     lex_attr_getters[LANG] = lambda text: 'pl' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 | 
 | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = set(STOP_WORDS) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										23
									
								
								spacy/lang/pl/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								spacy/lang/pl/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,23 @@ | ||||||
|  | # encoding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ..symbols import ORTH, LEMMA, POS | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _exc = {} | ||||||
|  | 
 | ||||||
|  | for exc_data in [ | ||||||
|  |     {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV}, | ||||||
|  |     {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN}, | ||||||
|  |     {ORTH: "mgr.", LEMMA: "magister", POS: NOUN}, | ||||||
|  |     {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV}, | ||||||
|  |     {ORTH: "tj.", LEMMA: "to jest", POS: ADV}, | ||||||
|  |     {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]: | ||||||
|  |     _exc[exc_data[ORTH]] = [dict(exc_data)], | ||||||
|  | 
 | ||||||
|  | for orth in [ | ||||||
|  |     "w.", "r."]: | ||||||
|  |     _exc[orth] = [{ORTH: orth}] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | TOKENIZER_EXCEPTIONS = dict(_exc) | ||||||
|  | @ -15,6 +15,7 @@ class Chinese(Language): | ||||||
|             raise ImportError("The Chinese tokenizer requires the Jieba library: " |             raise ImportError("The Chinese tokenizer requires the Jieba library: " | ||||||
|                               "https://github.com/fxsjy/jieba") |                               "https://github.com/fxsjy/jieba") | ||||||
|         words = list(jieba.cut(text, cut_all=True)) |         words = list(jieba.cut(text, cut_all=True)) | ||||||
|  |         words=[x for x in words if x] | ||||||
|         return Doc(self.vocab, words=words, spaces=[False]*len(words)) |         return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -110,5 +110,35 @@ def es_noun_chunks(obj): | ||||||
|         token = next_token(token) |         token = next_token(token) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def french_noun_chunks(obj): | ||||||
|  |     labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss'] | ||||||
|  |     doc = obj.doc  # Ensure works on both Doc and Span. | ||||||
|  |     np_deps = [doc.vocab.strings[label] for label in labels] | ||||||
|  |     conj = doc.vocab.strings.add('conj') | ||||||
|  |     np_label = doc.vocab.strings.add('NP') | ||||||
|  |     seen = set() | ||||||
|  |     for i, word in enumerate(obj): | ||||||
|  |         if word.pos not in (NOUN, PROPN, PRON): | ||||||
|  |             continue | ||||||
|  |         # Prevent nested chunks from being produced | ||||||
|  |         if word.i in seen: | ||||||
|  |             continue | ||||||
|  |         if word.dep in np_deps: | ||||||
|  |             if any(w.i in seen for w in word.subtree): | ||||||
|  |                 continue | ||||||
|  |             seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1)) | ||||||
|  |             yield word.left_edge.i, word.right_edge.i+1, np_label | ||||||
|  |         elif word.dep == conj: | ||||||
|  |             head = word.head | ||||||
|  |             while head.dep == conj and head.head.i < head.i: | ||||||
|  |                 head = head.head | ||||||
|  |             # If the head is an NP, and we're coordinated to it, we're an NP | ||||||
|  |             if head.dep in np_deps: | ||||||
|  |                 if any(w.i in seen for w in word.subtree): | ||||||
|  |                     continue | ||||||
|  |                 seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1)) | ||||||
|  |                 yield word.left_edge.i, word.right_edge.i+1, np_label | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, | CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, | ||||||
|             'es': es_noun_chunks} |             'es': es_noun_chunks, 'fr': french_noun_chunks} | ||||||
|  |  | ||||||
|  | @ -18,7 +18,7 @@ p | ||||||
| 
 | 
 | ||||||
|     # Construction 2 |     # Construction 2 | ||||||
|     from spacy.tokens import Doc |     from spacy.tokens import Doc | ||||||
|     doc = doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], |     doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], | ||||||
|                                spaces=[True, False, False]) |                                spaces=[True, False, False]) | ||||||
| 
 | 
 | ||||||
| +h(2, "init") Doc.__init__ | +h(2, "init") Doc.__init__ | ||||||
|  |  | ||||||
|  | @ -18,7 +18,7 @@ p | ||||||
|             +cell=cell |             +cell=cell | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Fist, the raw text is split on whitespace characters, similar to |     |  First, the raw text is split on whitespace characters, similar to | ||||||
|     |  #[code text.split(' ')]. Then, the tokenizer processes the text from |     |  #[code text.split(' ')]. Then, the tokenizer processes the text from | ||||||
|     |  left to right. On each substring, it performs two checks: |     |  left to right. On each substring, it performs two checks: | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -181,7 +181,7 @@ p | ||||||
|     from spacy.vocab import Vocab |     from spacy.vocab import Vocab | ||||||
| 
 | 
 | ||||||
|     nlp = spacy.load('en') |     nlp = spacy.load('en') | ||||||
|     moby_dick = open('moby_dick.txt', 'r') |     moby_dick = open('moby_dick.txt', 'r').read() | ||||||
|     doc = nlp(moby_dick) |     doc = nlp(moby_dick) | ||||||
|     doc.to_disk('/moby_dick.bin') |     doc.to_disk('/moby_dick.bin') | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -4,7 +4,7 @@ include ../../_includes/_mixins | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy] |     |  As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy] | ||||||
|     |  and #[+a(DEMOS_URL + "displacy-ent") displaCy #[sup ENT]] are finally an |     |  and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]] are finally an | ||||||
|     |  official part of the library. Visualizing a dependency parse or named |     |  official part of the library. Visualizing a dependency parse or named | ||||||
|     |  entities in a text is not only a fun NLP demo – it can also be incredibly |     |  entities in a text is not only a fun NLP demo – it can also be incredibly | ||||||
|     |  helpful in speeding up development and debugging your code and training |     |  helpful in speeding up development and debugging your code and training | ||||||
|  |  | ||||||
|  | @ -77,7 +77,7 @@ p | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     doc1 = nlp(u"Paris is the largest city in France.") |     doc1 = nlp(u"Paris is the largest city in France.") | ||||||
|     doc2 = nlp(u"Ljubljana is the capital of Lithuania.") |     doc2 = nlp(u"Vilnius is the capital of Lithuania.") | ||||||
|     doc3 = nlp(u"An emu is a large bird.") |     doc3 = nlp(u"An emu is a large bird.") | ||||||
| 
 | 
 | ||||||
|     for doc in [doc1, doc2, doc3]: |     for doc in [doc1, doc2, doc3]: | ||||||
|  | @ -85,13 +85,13 @@ p | ||||||
|             print(doc.similarity(other_doc)) |             print(doc.similarity(other_doc)) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Even though the sentences about Paris and Ljubljana consist of different |     |  Even though the sentences about Paris and Vilnius consist of different | ||||||
|     |  words and entities, they both describe the same concept and are seen as |     |  words and entities, they both describe the same concept and are seen as | ||||||
|     |  more similar than the sentence about emus. In this case, even a misspelled |     |  more similar than the sentence about emus. In this case, even a misspelled | ||||||
|     |  version of "Ljubljana" would still produce very similar results. |     |  version of "Vilnius" would still produce very similar results. | ||||||
| 
 | 
 | ||||||
| +table | +table | ||||||
|     - var examples = {"Paris is the largest city in France.": [1, 0.84, 0.65], "Ljubljana is the capital of Lithuania.": [0.84, 1, 0.52], "An emu is a large bird.": [0.65, 0.52, 1]} |     - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]} | ||||||
|     - var counter = 0 |     - var counter = 0 | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user