mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-13 13:17:06 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
5aab805c15
|
@ -130,10 +130,6 @@ cdef class Parser:
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
||||||
@property
|
|
||||||
def tok2vec(self):
|
|
||||||
return self.model.tok2vec
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def move_names(self):
|
def move_names(self):
|
||||||
names = []
|
names = []
|
||||||
|
|
|
@ -6,6 +6,7 @@ import pytest
|
||||||
from spacy.kb import KnowledgeBase
|
from spacy.kb import KnowledgeBase
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.pipeline import EntityRuler
|
from spacy.pipeline import EntityRuler
|
||||||
|
from spacy.tokens import Span
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -171,3 +172,31 @@ def test_preserving_links_asdoc(nlp):
|
||||||
for s_ent in sent_doc.ents:
|
for s_ent in sent_doc.ents:
|
||||||
if s_ent.text == orig_text:
|
if s_ent.text == orig_text:
|
||||||
assert s_ent.kb_id_ == orig_kb_id
|
assert s_ent.kb_id_ == orig_kb_id
|
||||||
|
|
||||||
|
|
||||||
|
def test_preserving_links_ents(nlp):
|
||||||
|
"""Test that doc.ents preserves KB annotations"""
|
||||||
|
text = "She lives in Boston. He lives in Denver."
|
||||||
|
doc = nlp(text)
|
||||||
|
assert len(list(doc.ents)) == 0
|
||||||
|
|
||||||
|
boston_ent = Span(doc, 3, 4, label="LOC", kb_id="Q1")
|
||||||
|
doc.ents = [boston_ent]
|
||||||
|
assert len(list(doc.ents)) == 1
|
||||||
|
assert list(doc.ents)[0].label_ == "LOC"
|
||||||
|
assert list(doc.ents)[0].kb_id_ == "Q1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_preserving_links_ents_2(nlp):
|
||||||
|
"""Test that doc.ents preserves KB annotations"""
|
||||||
|
text = "She lives in Boston. He lives in Denver."
|
||||||
|
doc = nlp(text)
|
||||||
|
assert len(list(doc.ents)) == 0
|
||||||
|
|
||||||
|
loc = doc.vocab.strings.add("LOC")
|
||||||
|
q1 = doc.vocab.strings.add("Q1")
|
||||||
|
|
||||||
|
doc.ents = [(loc, q1, 3, 4)]
|
||||||
|
assert len(list(doc.ents)) == 1
|
||||||
|
assert list(doc.ents)[0].label_ == "LOC"
|
||||||
|
assert list(doc.ents)[0].kb_id_ == "Q1"
|
||||||
|
|
|
@ -146,6 +146,7 @@ def _merge(Doc doc, merges):
|
||||||
syntactic root of the span.
|
syntactic root of the span.
|
||||||
RETURNS (Token): The first newly merged token.
|
RETURNS (Token): The first newly merged token.
|
||||||
"""
|
"""
|
||||||
|
cdef int i, merge_index, start, end, token_index
|
||||||
cdef Span span
|
cdef Span span
|
||||||
cdef const LexemeC* lex
|
cdef const LexemeC* lex
|
||||||
cdef TokenC* token
|
cdef TokenC* token
|
||||||
|
|
|
@ -534,7 +534,7 @@ cdef class Doc:
|
||||||
cdef attr_t entity_type
|
cdef attr_t entity_type
|
||||||
cdef int ent_start, ent_end
|
cdef int ent_start, ent_end
|
||||||
for ent_info in ents:
|
for ent_info in ents:
|
||||||
entity_type, ent_start, ent_end = get_entity_info(ent_info)
|
entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
|
||||||
for token_index in range(ent_start, ent_end):
|
for token_index in range(ent_start, ent_end):
|
||||||
if token_index in tokens_in_ents.keys():
|
if token_index in tokens_in_ents.keys():
|
||||||
raise ValueError(Errors.E103.format(
|
raise ValueError(Errors.E103.format(
|
||||||
|
@ -542,7 +542,7 @@ cdef class Doc:
|
||||||
tokens_in_ents[token_index][1],
|
tokens_in_ents[token_index][1],
|
||||||
self.vocab.strings[tokens_in_ents[token_index][2]]),
|
self.vocab.strings[tokens_in_ents[token_index][2]]),
|
||||||
span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
|
span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
|
||||||
tokens_in_ents[token_index] = (ent_start, ent_end, entity_type)
|
tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
|
@ -551,16 +551,18 @@ cdef class Doc:
|
||||||
cdef attr_t ent_type
|
cdef attr_t ent_type
|
||||||
cdef int start, end
|
cdef int start, end
|
||||||
for ent_info in ents:
|
for ent_info in ents:
|
||||||
ent_type, start, end = get_entity_info(ent_info)
|
ent_type, ent_kb_id, start, end = get_entity_info(ent_info)
|
||||||
if ent_type is None or ent_type < 0:
|
if ent_type is None or ent_type < 0:
|
||||||
# Mark as O
|
# Mark as O
|
||||||
for i in range(start, end):
|
for i in range(start, end):
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
|
self.c[i].ent_kb_id = 0
|
||||||
self.c[i].ent_iob = 2
|
self.c[i].ent_iob = 2
|
||||||
else:
|
else:
|
||||||
# Mark (inside) as I
|
# Mark (inside) as I
|
||||||
for i in range(start, end):
|
for i in range(start, end):
|
||||||
self.c[i].ent_type = ent_type
|
self.c[i].ent_type = ent_type
|
||||||
|
self.c[i].ent_kb_id = ent_kb_id
|
||||||
self.c[i].ent_iob = 1
|
self.c[i].ent_iob = 1
|
||||||
# Set start as B
|
# Set start as B
|
||||||
self.c[start].ent_iob = 3
|
self.c[start].ent_iob = 3
|
||||||
|
@ -1251,10 +1253,14 @@ def fix_attributes(doc, attributes):
|
||||||
def get_entity_info(ent_info):
|
def get_entity_info(ent_info):
|
||||||
if isinstance(ent_info, Span):
|
if isinstance(ent_info, Span):
|
||||||
ent_type = ent_info.label
|
ent_type = ent_info.label
|
||||||
|
ent_kb_id = ent_info.kb_id
|
||||||
start = ent_info.start
|
start = ent_info.start
|
||||||
end = ent_info.end
|
end = ent_info.end
|
||||||
elif len(ent_info) == 3:
|
elif len(ent_info) == 3:
|
||||||
ent_type, start, end = ent_info
|
ent_type, start, end = ent_info
|
||||||
|
ent_kb_id = 0
|
||||||
|
elif len(ent_info) == 4:
|
||||||
|
ent_type, ent_kb_id, start, end = ent_info
|
||||||
else:
|
else:
|
||||||
ent_id, ent_type, start, end = ent_info
|
ent_id, ent_kb_id, ent_type, start, end = ent_info
|
||||||
return ent_type, start, end
|
return ent_type, ent_kb_id, start, end
|
||||||
|
|
|
@ -186,63 +186,63 @@ The German part-of-speech tagger uses the
|
||||||
annotation scheme. We also map the tags to the simpler Google Universal POS tag
|
annotation scheme. We also map the tags to the simpler Google Universal POS tag
|
||||||
set.
|
set.
|
||||||
|
|
||||||
| Tag | POS | Morphology | Description |
|
| Tag | POS | Morphology | Description |
|
||||||
| --------- | ------- | ------------------------------------------- | ------------------------------------------------- |
|
| --------- | ------- | ---------------------------------------- | ------------------------------------------------- |
|
||||||
| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark |
|
| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark |
|
||||||
| `$,` | `PUNCT` | `PunctType=comm` | comma |
|
| `$,` | `PUNCT` | `PunctType=comm` | comma |
|
||||||
| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark |
|
| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark |
|
||||||
| `ADJA` | `ADJ` | | adjective, attributive |
|
| `ADJA` | `ADJ` | | adjective, attributive |
|
||||||
| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative |
|
| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative |
|
||||||
| `ADV` | `ADV` | | adverb |
|
| `ADV` | `ADV` | | adverb |
|
||||||
| `APPO` | `ADP` | `AdpType=post` | postposition |
|
| `APPO` | `ADP` | `AdpType=post` | postposition |
|
||||||
| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left |
|
| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left |
|
||||||
| `APPRART` | `ADP` | `AdpType=prep PronType=art` | preposition with article |
|
| `APPRART` | `ADP` | `AdpType=prep PronType=art` | preposition with article |
|
||||||
| `APZR` | `ADP` | `AdpType=circ` | circumposition right |
|
| `APZR` | `ADP` | `AdpType=circ` | circumposition right |
|
||||||
| `ART` | `DET` | `PronType=art` | definite or indefinite article |
|
| `ART` | `DET` | `PronType=art` | definite or indefinite article |
|
||||||
| `CARD` | `NUM` | `NumType=card` | cardinal number |
|
| `CARD` | `NUM` | `NumType=card` | cardinal number |
|
||||||
| `FM` | `X` | `Foreign=yes` | foreign language material |
|
| `FM` | `X` | `Foreign=yes` | foreign language material |
|
||||||
| `ITJ` | `INTJ` | | interjection |
|
| `ITJ` | `INTJ` | | interjection |
|
||||||
| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction |
|
| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction |
|
||||||
| `KON` | `CONJ` | | coordinate conjunction |
|
| `KON` | `CONJ` | | coordinate conjunction |
|
||||||
| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive |
|
| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive |
|
||||||
| `KOUS` | `SCONJ` | | subordinate conjunction with sentence |
|
| `KOUS` | `SCONJ` | | subordinate conjunction with sentence |
|
||||||
| `NE` | `PROPN` | | proper noun |
|
| `NE` | `PROPN` | | proper noun |
|
||||||
| `NNE` | `PROPN` | | proper noun |
|
| `NNE` | `PROPN` | | proper noun |
|
||||||
| `NN` | `NOUN` | | noun, singular or mass |
|
| `NN` | `NOUN` | | noun, singular or mass |
|
||||||
| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb |
|
| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb |
|
||||||
| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun |
|
| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun |
|
||||||
| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun |
|
| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun |
|
||||||
| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner |
|
| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner |
|
||||||
| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun |
|
| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun |
|
||||||
| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun |
|
| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun |
|
||||||
| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun |
|
| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun |
|
||||||
| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun |
|
| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun |
|
||||||
| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun |
|
| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun |
|
||||||
| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun |
|
| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun |
|
||||||
| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun |
|
| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun |
|
||||||
| `PTKA` | `PART` | | particle with adjective or adverb |
|
| `PTKA` | `PART` | | particle with adjective or adverb |
|
||||||
| `PTKANT` | `PART` | `PartType=res` | answer particle |
|
| `PTKANT` | `PART` | `PartType=res` | answer particle |
|
||||||
| `PTKNEG` | `PART` | `Negative=yes` | negative particle |
|
| `PTKNEG` | `PART` | `Negative=yes` | negative particle |
|
||||||
| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle |
|
| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle |
|
||||||
| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive |
|
| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive |
|
||||||
| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun |
|
| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun |
|
||||||
| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun |
|
| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun |
|
||||||
| `PWS` | `PRON` | `PronType=int` | substituting interrogative pronoun |
|
| `PWS` | `PRON` | `PronType=int` | substituting interrogative pronoun |
|
||||||
| `TRUNC` | `X` | `Hyph=yes` | word remnant |
|
| `TRUNC` | `X` | `Hyph=yes` | word remnant |
|
||||||
| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary |
|
| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary |
|
||||||
| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary |
|
| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary |
|
||||||
| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary |
|
| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary |
|
||||||
| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary |
|
| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary |
|
||||||
| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal |
|
| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal |
|
||||||
| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal |
|
| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal |
|
||||||
| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal |
|
| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal |
|
||||||
| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full |
|
| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full |
|
||||||
| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full |
|
| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full |
|
||||||
| `VVINF` | `VERB` | `VerbForm=inf` | infinitive, full |
|
| `VVINF` | `VERB` | `VerbForm=inf` | infinitive, full |
|
||||||
| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full |
|
| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full |
|
||||||
| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full |
|
| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full |
|
||||||
| `XY` | `X` | | non-word containing non-letter |
|
| `XY` | `X` | | non-word containing non-letter |
|
||||||
| `SP` | `SPACE` | | space |
|
| `SP` | `SPACE` | | space |
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
|
@ -379,51 +379,51 @@ The German dependency labels use the
|
||||||
[TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html)
|
[TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html)
|
||||||
annotation scheme.
|
annotation scheme.
|
||||||
|
|
||||||
| Label | Description |
|
| Label | Description |
|
||||||
| ------ | ------------------------------- |
|
| ------- | ------------------------------- |
|
||||||
| `ac` | adpositional case marker |
|
| `ac` | adpositional case marker |
|
||||||
| `adc` | adjective component |
|
| `adc` | adjective component |
|
||||||
| `ag` | genitive attribute |
|
| `ag` | genitive attribute |
|
||||||
| `ams` | measure argument of adjective |
|
| `ams` | measure argument of adjective |
|
||||||
| `app` | apposition |
|
| `app` | apposition |
|
||||||
| `avc` | adverbial phrase component |
|
| `avc` | adverbial phrase component |
|
||||||
| `cc` | comparative complement |
|
| `cc` | comparative complement |
|
||||||
| `cd` | coordinating conjunction |
|
| `cd` | coordinating conjunction |
|
||||||
| `cj` | conjunct |
|
| `cj` | conjunct |
|
||||||
| `cm` | comparative conjunction |
|
| `cm` | comparative conjunction |
|
||||||
| `cp` | complementizer |
|
| `cp` | complementizer |
|
||||||
| `cvc` | collocational verb construction |
|
| `cvc` | collocational verb construction |
|
||||||
| `da` | dative |
|
| `da` | dative |
|
||||||
| `dm` | discourse marker |
|
| `dm` | discourse marker |
|
||||||
| `ep` | expletive es |
|
| `ep` | expletive es |
|
||||||
| `ju` | junctor |
|
| `ju` | junctor |
|
||||||
| `mnr` | postnominal modifier |
|
| `mnr` | postnominal modifier |
|
||||||
| `mo` | modifier |
|
| `mo` | modifier |
|
||||||
| `ng` | negation |
|
| `ng` | negation |
|
||||||
| `nk` | noun kernel element |
|
| `nk` | noun kernel element |
|
||||||
| `nmc` | numerical component |
|
| `nmc` | numerical component |
|
||||||
| `oa` | accusative object |
|
| `oa` | accusative object |
|
||||||
| `oa2` | second accusative object |
|
| `oa2` | second accusative object |
|
||||||
| `oc` | clausal object |
|
| `oc` | clausal object |
|
||||||
| `og` | genitive object |
|
| `og` | genitive object |
|
||||||
| `op` | prepositional object |
|
| `op` | prepositional object |
|
||||||
| `par` | parenthetical element |
|
| `par` | parenthetical element |
|
||||||
| `pd` | predicate |
|
| `pd` | predicate |
|
||||||
| `pg` | phrasal genitive |
|
| `pg` | phrasal genitive |
|
||||||
| `ph` | placeholder |
|
| `ph` | placeholder |
|
||||||
| `pm` | morphological particle |
|
| `pm` | morphological particle |
|
||||||
| `pnc` | proper noun component |
|
| `pnc` | proper noun component |
|
||||||
| `punct` | punctuation |
|
| `punct` | punctuation |
|
||||||
| `rc` | relative clause |
|
| `rc` | relative clause |
|
||||||
| `re` | repeated element |
|
| `re` | repeated element |
|
||||||
| `rs` | reported speech |
|
| `rs` | reported speech |
|
||||||
| `sb` | subject |
|
| `sb` | subject |
|
||||||
| `sbp` | passivized subject (PP) |
|
| `sbp` | passivized subject (PP) |
|
||||||
| `sp` | subject or predicate |
|
| `sp` | subject or predicate |
|
||||||
| `svp` | separable verb prefix |
|
| `svp` | separable verb prefix |
|
||||||
| `uc` | unit component |
|
| `uc` | unit component |
|
||||||
| `vo` | vocative |
|
| `vo` | vocative |
|
||||||
| `ROOT` | root |
|
| `ROOT` | root |
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
|
@ -584,8 +584,8 @@ data.
|
||||||
```python
|
```python
|
||||||
### Entry structure
|
### Entry structure
|
||||||
{
|
{
|
||||||
"orth": string,
|
"orth": string, # the word text
|
||||||
"id": int,
|
"id": int, # can correspond to row in vectors table
|
||||||
"lower": string,
|
"lower": string,
|
||||||
"norm": string,
|
"norm": string,
|
||||||
"shape": string
|
"shape": string
|
||||||
|
|
|
@ -174,12 +174,12 @@ All output files generated by this command are compatible with
|
||||||
|
|
||||||
<!-- TODO: document jsonl option – maybe update it? -->
|
<!-- TODO: document jsonl option – maybe update it? -->
|
||||||
|
|
||||||
| ID | Description |
|
| ID | Description |
|
||||||
| ------------------------------ | --------------------------------------------------------------- |
|
| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `auto` | Automatically pick converter based on file extension and file content (default). |
|
| `auto` | Automatically pick converter based on file extension and file content (default). |
|
||||||
| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
|
| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
|
||||||
| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
|
| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
|
||||||
| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
|
| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
|
||||||
|
|
||||||
## Train {#train}
|
## Train {#train}
|
||||||
|
|
||||||
|
@ -291,26 +291,26 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
|
||||||
[--seed] [--n-iter] [--use-vectors] [--n-save-every] [--init-tok2vec] [--epoch-start]
|
[--seed] [--n-iter] [--use-vectors] [--n-save-every] [--init-tok2vec] [--epoch-start]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| ----------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details. |
|
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details. |
|
||||||
| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. |
|
| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. |
|
||||||
| `output_dir` | positional | Directory to write models to on each epoch. |
|
| `output_dir` | positional | Directory to write models to on each epoch. |
|
||||||
| `--width`, `-cw` | option | Width of CNN layers. |
|
| `--width`, `-cw` | option | Width of CNN layers. |
|
||||||
| `--depth`, `-cd` | option | Depth of CNN layers. |
|
| `--depth`, `-cd` | option | Depth of CNN layers. |
|
||||||
| `--embed-rows`, `-er` | option | Number of embedding rows. |
|
| `--embed-rows`, `-er` | option | Number of embedding rows. |
|
||||||
| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. |
|
| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. |
|
||||||
| `--dropout`, `-d` | option | Dropout rate. |
|
| `--dropout`, `-d` | option | Dropout rate. |
|
||||||
| `--batch-size`, `-bs` | option | Number of words per training batch. |
|
| `--batch-size`, `-bs` | option | Number of words per training batch. |
|
||||||
| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. |
|
| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. |
|
||||||
| `--min-length`, `-nw` | option | Minimum words per example. Shorter examples are discarded. |
|
| `--min-length`, `-nw` | option | Minimum words per example. Shorter examples are discarded. |
|
||||||
| `--seed`, `-s` | option | Seed for random number generators. |
|
| `--seed`, `-s` | option | Seed for random number generators. |
|
||||||
| `--n-iter`, `-i` | option | Number of iterations to pretrain. |
|
| `--n-iter`, `-i` | option | Number of iterations to pretrain. |
|
||||||
| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. |
|
| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. |
|
||||||
| `--n-save-every`, `-se` | option | Save model every X batches. |
|
| `--n-save-every`, `-se` | option | Save model every X batches. |
|
||||||
| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.|
|
| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
|
||||||
| `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.|
|
| `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. |
|
||||||
| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. |
|
| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. |
|
||||||
|
|
||||||
### JSONL format for raw text {#pretrain-jsonl}
|
### JSONL format for raw text {#pretrain-jsonl}
|
||||||
|
|
||||||
|
@ -330,10 +330,10 @@ tokenization can be provided.
|
||||||
> srsly.write_jsonl("/path/to/text.jsonl", data)
|
> srsly.write_jsonl("/path/to/text.jsonl", data)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Key | Type | Description |
|
| Key | Type | Description |
|
||||||
| -------- | ------- | -------------------------------------------- |
|
| -------- | ------- | ---------------------------------------------------------- |
|
||||||
| `text` | unicode | The raw input text. Is not required if `tokens` available. |
|
| `text` | unicode | The raw input text. Is not required if `tokens` available. |
|
||||||
| `tokens` | list | Optional tokenization, one string per token. |
|
| `tokens` | list | Optional tokenization, one string per token. |
|
||||||
|
|
||||||
```json
|
```json
|
||||||
### Example
|
### Example
|
||||||
|
@ -347,14 +347,17 @@ tokenization can be provided.
|
||||||
|
|
||||||
Create a new model directory from raw data, like word frequencies, Brown
|
Create a new model directory from raw data, like word frequencies, Brown
|
||||||
clusters and word vectors. This command is similar to the `spacy model` command
|
clusters and word vectors. This command is similar to the `spacy model` command
|
||||||
in v1.x.
|
in v1.x. Note that in order to populate the model's vocab, you need to pass in a
|
||||||
|
JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) as
|
||||||
|
`--jsonl-loc` with optional `id` values that correspond to the vectors table.
|
||||||
|
Just loading in vectors will not automatically populate the vocab.
|
||||||
|
|
||||||
<Infobox title="Deprecation note" variant="warning">
|
<Infobox title="Deprecation note" variant="warning">
|
||||||
|
|
||||||
As of v2.1.0, the `--freqs-loc` and `--clusters-loc` are deprecated and have
|
As of v2.1.0, the `--freqs-loc` and `--clusters-loc` are deprecated and have
|
||||||
been replaced with the `--jsonl-loc` argument, which lets you pass in a
|
been replaced with the `--jsonl-loc` argument, which lets you pass in a
|
||||||
[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
|
[JSONL](http://jsonlines.org/) file containing one lexical entry per line. For
|
||||||
lexical entry per line. For more details on the format, see the
|
more details on the format, see the
|
||||||
[annotation specs](/api/annotation#vocab-jsonl).
|
[annotation specs](/api/annotation#vocab-jsonl).
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
@ -368,7 +371,7 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
|
||||||
| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
|
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
|
||||||
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
|
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
|
||||||
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted vocabulary file with lexical attributes. |
|
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
|
||||||
| `--vectors-loc`, `-v` | option | Optional location of vectors file. Should be a tab-separated file in Word2Vec format where the first column contains the word and the remaining columns the values. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
|
| `--vectors-loc`, `-v` | option | Optional location of vectors file. Should be a tab-separated file in Word2Vec format where the first column contains the word and the remaining columns the values. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
|
||||||
| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
|
| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
|
||||||
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
|
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
|
||||||
|
@ -424,7 +427,7 @@ pip install dist/en_model-0.0.0.tar.gz
|
||||||
| `input_dir` | positional | Path to directory containing model data. |
|
| `input_dir` | positional | Path to directory containing model data. |
|
||||||
| `output_dir` | positional | Directory to create package folder in. |
|
| `output_dir` | positional | Directory to create package folder in. |
|
||||||
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Path to `meta.json` file (optional). |
|
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Path to `meta.json` file (optional). |
|
||||||
| `--create-meta`, `-c` <Tag variant="new">2</Tag> | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt.
|
| `--create-meta`, `-c` <Tag variant="new">2</Tag> | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. |
|
||||||
| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. |
|
| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. |
|
||||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
| **CREATES** | directory | A Python package containing the spaCy model. |
|
| **CREATES** | directory | A Python package containing the spaCy model. |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user