Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
Commit 30e67fa808

(File diff suppressed because it is too large.)
							|  | @ -101,7 +101,7 @@ def generate_meta(): | |||
| def generate_pipeline(): | ||||
|     prints("If set to 'True', the default pipeline is used. If set to 'False', " | ||||
|            "the pipeline will be disabled. Components should be specified as a " | ||||
|            "comma-separated list of component names, e.g. tensorizer, tagger, " | ||||
|            "comma-separated list of component names, e.g. tagger, " | ||||
|            "parser, ner. For more information, see the docs on processing pipelines.", | ||||
|            title="Enter your model's pipeline components") | ||||
|     pipeline = util.get_raw_input("Pipeline components", True) | ||||
|  |  | |||
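Editorial note: the prompt above expects either a boolean-like answer or a comma-separated list of component names. A rough sketch of how such input could be normalised (the helper parse_pipeline_input is hypothetical and not part of spaCy's CLI):

    def parse_pipeline_input(value):
        # 'True' keeps the default pipeline, 'False' disables it; anything else
        # is treated as a comma-separated list of component names.
        if value in ('True', 'False'):
            return value == 'True'
        return [component.strip() for component in value.split(',')]

    assert parse_pipeline_input('tagger, parser, ner') == ['tagger', 'parser', 'ner']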
|  | @ -7,8 +7,8 @@ from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos | |||
| 
 | ||||
| class Lemmatizer(object): | ||||
|     @classmethod | ||||
|     def load(cls, path, index=None, exc=None, rules=None): | ||||
|         return cls(index or {}, exc or {}, rules or {}) | ||||
|     def load(cls, path, index=None, exc=None, rules=None, lookup=None): | ||||
|         return cls(index or {}, exc or {}, rules or {}, lookup or {}) | ||||
| 
 | ||||
|     def __init__(self, index=None, exceptions=None, rules=None, lookup=None): | ||||
|         self.index = index if index is not None else {} | ||||
|  | @ -26,10 +26,10 @@ class Lemmatizer(object): | |||
|         elif univ_pos in (PUNCT, 'PUNCT', 'punct'): | ||||
|             univ_pos = 'punct' | ||||
|         else: | ||||
|             return set([string.lower()]) | ||||
|             return list(set([string.lower()])) | ||||
|         # See Issue #435 for an example of where this logic is required. | ||||
|         if self.is_base_form(univ_pos, morphology): | ||||
|             return set([string.lower()]) | ||||
|             return list(set([string.lower()])) | ||||
|         lemmas = lemmatize(string, self.index.get(univ_pos, {}), | ||||
|                            self.exc.get(univ_pos, {}), | ||||
|                            self.rules.get(univ_pos, [])) | ||||
|  | @ -108,4 +108,4 @@ def lemmatize(string, index, exceptions, rules): | |||
|         forms.extend(oov_forms) | ||||
|     if not forms: | ||||
|         forms.append(string) | ||||
|     return set(forms) | ||||
|     return list(set(forms)) | ||||
|  |  | |||
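Editorial note: the hunk above makes the lemmatizer return a plain list instead of a set and threads an optional lookup table through Lemmatizer.load. A minimal sketch of the behaviour implied by this diff (the data passed in is invented):

    from spacy.lemmatizer import Lemmatizer

    # keyword arguments follow the signatures shown in the diff above
    lemmatizer = Lemmatizer(rules={}, lookup={u'going': u'go'})
    lemmas = lemmatizer(u'going', u'VERB')
    assert isinstance(lemmas, list)   # a set was returned before this change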
|  | @ -172,7 +172,7 @@ cdef class Morphology: | |||
|         cdef unicode py_string = self.strings[orth] | ||||
|         if self.lemmatizer is None: | ||||
|             return self.strings.add(py_string.lower()) | ||||
|         cdef set lemma_strings | ||||
|         cdef list lemma_strings | ||||
|         cdef unicode lemma_string | ||||
|         lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) | ||||
|         lemma_string = sorted(lemma_strings)[0] | ||||
|  |  | |||
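Editorial note: because the lemmatizer now hands back a list, the sorted(...)[0] call above still makes the choice deterministic when several candidate lemmas are available; a toy illustration (the candidates are made up):

    lemma_strings = [u'ducks', u'duck']        # hypothetical candidates from the lemmatizer
    lemma_string = sorted(lemma_strings)[0]    # alphabetically first candidate is picked
    assert lemma_string == u'duck'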
|  | @ -100,7 +100,7 @@ def test_spans_are_hashable(en_tokenizer): | |||
|     assert hash(span1) != hash(span2) | ||||
|     span3 = tokens[0:2] | ||||
|     assert hash(span3) == hash(span1) | ||||
|   | ||||
| 
 | ||||
| 
 | ||||
| def test_spans_by_character(doc): | ||||
|     span1 = doc[1:-2] | ||||
|  | @ -117,3 +117,9 @@ def test_span_to_array(doc): | |||
|     assert arr[0, 0] == span[0].orth | ||||
|     assert arr[0, 1] == len(span[0]) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_span_as_doc(doc): | ||||
|     span = doc[4:10] | ||||
|     span_doc = span.as_doc() | ||||
|     assert span.text == span_doc.text | ||||
|  |  | |||
|  | @ -181,7 +181,7 @@ mixin codepen(slug, height, default_tab) | |||
|     alt_file - [string] alternative file path used in footer and link button | ||||
|     height   - [integer] height of code preview in px | ||||
| 
 | ||||
| mixin github(repo, file, alt_file, height) | ||||
| mixin github(repo, file, alt_file, height, language) | ||||
|     - var branch = ALPHA ? "develop" : "master" | ||||
|     - var height = height || 250 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
website/api/_annotation/_training.jade (new file, 46 lines added)
							|  | @ -0,0 +1,46 @@ | |||
| //- 💫 DOCS > API > ANNOTATION > TRAINING | ||||
| 
 | ||||
| p | ||||
|     |  spaCy takes training data in JSON format. The built-in | ||||
|     |  #[+api("cli#convert") #[code convert]] command helps you convert the | ||||
|     |  #[code .conllu] format used by the | ||||
|     |  #[+a("https://github.com/UniversalDependencies") Universal Dependencies corpora] | ||||
|     |  to spaCy's training format. | ||||
| 
 | ||||
| +aside("Annotating entities") | ||||
|     |  Named entities are provided in the #[+a("/api/annotation#biluo") BILUO] | ||||
|     |  notation. Tokens outside an entity are set to #[code "O"] and tokens | ||||
|     |  that are part of an entity are set to the entity label, prefixed by the | ||||
|     |  BILUO marker. For example #[code "B-ORG"] describes the first token of | ||||
|     |  a multi-token #[code ORG] entity and #[code "U-PERSON"] a single | ||||
|     |  token representing a #[code PERSON] entity. | ||||
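Editorial note: for illustration, an invented sentence tagged in the BILUO notation described above (tokens and labels are made up for this example):

    # "Larry Page founded Google" with per-token BILUO tags
    biluo_tags = [
        (u'Larry',   u'B-PERSON'),   # first token of a multi-token PERSON entity
        (u'Page',    u'L-PERSON'),   # last token of that entity
        (u'founded', u'O'),          # outside any entity
        (u'Google',  u'U-ORG'),      # single-token ORG entity
    ]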
| 
 | ||||
| +code("Example structure"). | ||||
|     [{ | ||||
|         "id": int,                      # ID of the document within the corpus | ||||
|         "paragraphs": [{                # list of paragraphs in the corpus | ||||
|             "raw": string,              # raw text of the paragraph | ||||
|             "sentences": [{             # list of sentences in the paragraph | ||||
|                 "tokens": [{            # list of tokens in the sentence | ||||
|                     "id": int,          # index of the token in the document | ||||
|                     "dep": string,      # dependency label | ||||
|                     "head": int,        # offset of token head relative to token index | ||||
|                     "tag": string,      # part-of-speech tag | ||||
|                     "orth": string,     # verbatim text of the token | ||||
|                     "ner": string       # BILUO label, e.g. "O" or "B-ORG" | ||||
|                 }], | ||||
|                 "brackets": [{          # phrase structure (NOT USED by current models) | ||||
|                     "first": int,       # index of first token | ||||
|                     "last": int,        # index of last token | ||||
|                     "label": string     # phrase label | ||||
|                 }] | ||||
|             }] | ||||
|         }] | ||||
|     }] | ||||
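Editorial note: to make the schema above concrete, here is a minimal, invented document that follows it (Python literal; the text and annotations are made up, not taken from a real corpus):

    TRAIN_DATA = [{
        "id": 0,
        "paragraphs": [{
            "raw": "I like London.",
            "sentences": [{
                "tokens": [
                    {"id": 0, "orth": "I",      "tag": "PRP", "head": 1,  "dep": "nsubj", "ner": "O"},
                    {"id": 1, "orth": "like",   "tag": "VBP", "head": 0,  "dep": "ROOT",  "ner": "O"},
                    {"id": 2, "orth": "London", "tag": "NNP", "head": -1, "dep": "dobj",  "ner": "U-GPE"},
                    {"id": 3, "orth": ".",      "tag": ".",   "head": -2, "dep": "punct", "ner": "O"},
                ],
                "brackets": []    # no phrase-structure brackets in this toy example
            }]
        }]
    }]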
| 
 | ||||
| p | ||||
|     |  Here's an example of dependencies, part-of-speech tags and named | ||||
|     |  entities, taken from the English Wall Street Journal portion of the Penn | ||||
|     |  Treebank: | ||||
| 
 | ||||
| +github("spacy", "examples/training/training-data.json", false, false, "json") | ||||
|  | @ -154,13 +154,16 @@ | |||
| 
 | ||||
|     "tokenizer": { | ||||
|         "title": "Tokenizer", | ||||
|         "teaser": "Segment text into words, punctuation marks etc.", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/tokenizer.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "lemmatizer": { | ||||
|         "title": "Lemmatizer", | ||||
|         "tag": "class" | ||||
|         "teaser": "Assign the base forms of words.", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/lemmatizer.py" | ||||
|     }, | ||||
| 
 | ||||
|     "tagger": { | ||||
|  |  | |||
|  | @ -101,31 +101,4 @@ p This document describes the target annotations spaCy is trained to predict. | |||
| +section("training") | ||||
|     +h(2, "json-input") JSON input format for training | ||||
| 
 | ||||
|     +under-construction | ||||
| 
 | ||||
|     p spaCy takes training data in the following format: | ||||
| 
 | ||||
|     +code("Example structure"). | ||||
|         doc: { | ||||
|             id: string, | ||||
|             paragraphs: [{ | ||||
|                 raw: string, | ||||
|                 sents: [int], | ||||
|                 tokens: [{ | ||||
|                     start: int, | ||||
|                     tag: string, | ||||
|                     head: int, | ||||
|                     dep: string | ||||
|                 }], | ||||
|                 ner: [{ | ||||
|                     start: int, | ||||
|                     end: int, | ||||
|                     label: string | ||||
|                 }], | ||||
|                 brackets: [{ | ||||
|                     start: int, | ||||
|                     end: int, | ||||
|                     label: string | ||||
|                 }] | ||||
|             }] | ||||
|         } | ||||
|     include _annotation/_training | ||||
|  |  | |||
|  | @ -2,4 +2,159 @@ | |||
| 
 | ||||
| include ../_includes/_mixins | ||||
| 
 | ||||
| +under-construction | ||||
| p | ||||
|     |  The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix | ||||
|     |  rules and lookup tables. | ||||
| 
 | ||||
| +h(2, "init") Lemmatizer.__init__ | ||||
|     +tag method | ||||
| 
 | ||||
| p Create a #[code Lemmatizer]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.lemmatizer import Lemmatizer | ||||
|     lemmatizer = Lemmatizer() | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code index] | ||||
|         +cell dict / #[code None] | ||||
|         +cell Inventory of lemmas in the language. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code exceptions] | ||||
|         +cell dict / #[code None] | ||||
|         +cell Mapping of string forms to lemmas that bypass the #[code rules]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code rules] | ||||
|         +cell dict / #[code None] | ||||
|         +cell List of suffix rewrite rules. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code lookup] | ||||
|         +cell dict / #[code None] | ||||
|         +cell Lookup table mapping strings to their lemmas. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Lemmatizer] | ||||
|         +cell The newly created object. | ||||
| 
 | ||||
| +h(2, "call") Lemmatizer.__call__ | ||||
|     +tag method | ||||
| 
 | ||||
| p Lemmatize a string. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.lemmatizer import Lemmatizer | ||||
|     from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES | ||||
|     lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) | ||||
|     lemmas = lemmatizer(u'ducks', u'NOUN') | ||||
|     assert lemmas == [u'duck'] | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code string] | ||||
|         +cell unicode | ||||
|         +cell The string to lemmatize, e.g. the token text. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code univ_pos] | ||||
|         +cell unicode / int | ||||
|         +cell The token's universal part-of-speech tag. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code morphology] | ||||
|         +cell dict / #[code None] | ||||
|         +cell | ||||
|             |  Morphological features following the | ||||
|             |  #[+a("http://universaldependencies.org/") Universal Dependencies] | ||||
|             |  scheme. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell list | ||||
|         +cell The available lemmas for the string. | ||||
| 
 | ||||
| +h(2, "lookup") Lemmatizer.lookup | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Look up a lemma in the lookup table, if available. If no lemma is found, | ||||
|     |  the original string is returned. Languages can provide a | ||||
|     |  #[+a("/usage/adding-languages#lemmatizer") lookup table] via the | ||||
|     |  #[code lemma_lookup] variable, set on the individual #[code Language] | ||||
|     |  class. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     lookup = {u'going': u'go'} | ||||
|     lemmatizer = Lemmatizer(lookup=lookup) | ||||
|     assert lemmatizer.lookup(u'going') == u'go' | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code string] | ||||
|         +cell unicode | ||||
|         +cell The string to look up. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell unicode | ||||
|         +cell The lemma if the string was found, otherwise the original string. | ||||
| 
 | ||||
| +h(2, "is_base_form") Lemmatizer.is_base_form | ||||
|     +tag method | ||||
| 
 | ||||
| p | ||||
|     |  Check whether we're dealing with an uninflected paradigm, so we can | ||||
|     |  avoid lemmatization entirely. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     pos = 'verb' | ||||
|     morph = {'VerbForm': 'inf'} | ||||
|     is_base_form = lemmatizer.is_base_form(pos, morph) | ||||
|     assert is_base_form == True | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code univ_pos] | ||||
|         +cell unicode / int | ||||
|         +cell The token's universal part-of-speech tag. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code morphology] | ||||
|         +cell dict | ||||
|         +cell The token's morphological features. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell bool | ||||
|         +cell | ||||
|             |  Whether the token's part-of-speech tag and morphological features | ||||
|             |  describe a base form. | ||||
| 
 | ||||
| +h(2, "attributes") Attributes | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code index] | ||||
|         +cell dict / #[code None] | ||||
|         +cell Inventory of lemmas in the language. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code exc] | ||||
|         +cell dict / #[code None] | ||||
|         +cell Mapping of string forms to lemmas that bypass the #[code rules]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code rules] | ||||
|         +cell dict / #[code None] | ||||
|         +cell List of suffix rewrite rules. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code lookup_table] | ||||
|             +tag-new(2) | ||||
|         +cell dict / #[code None] | ||||
|         +cell The lemma lookup table, if available. | ||||
|  |  | |||
|  | @ -1,3 +1,7 @@ | |||
| //- 💫 DOCS > USAGE > TRAINING > TAGGER & PARSER | ||||
| 
 | ||||
| +under-construction | ||||
| 
 | ||||
| +h(3, "training-json") JSON format for training | ||||
| 
 | ||||
| include ../../api/_annotation/_training | ||||
|  |  | |||