mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-08 22:03:24 +03:00
Update API docs with changes in spacy.gold and spacy.language
This commit is contained in:
parent
b5fb43fdd8
commit
54f04a9fe0
|
@ -142,9 +142,14 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
|
|
||||||
|
|
||||||
class GoldCorpus(object):
|
class GoldCorpus(object):
|
||||||
'''An annotated corpus, using the JSON file format. Manages
|
"""An annotated corpus, using the JSON file format. Manages
|
||||||
annotations for tagging, dependency parsing, NER.'''
|
annotations for tagging, dependency parsing and NER."""
|
||||||
def __init__(self, train_path, dev_path):
|
def __init__(self, train_path, dev_path):
|
||||||
|
"""Create a GoldCorpus.
|
||||||
|
|
||||||
|
train_path (unicode or Path): File or directory of training data.
|
||||||
|
dev_path (unicode or Path): File or directory of development data.
|
||||||
|
"""
|
||||||
self.train_path = util.ensure_path(train_path)
|
self.train_path = util.ensure_path(train_path)
|
||||||
self.dev_path = util.ensure_path(dev_path)
|
self.dev_path = util.ensure_path(dev_path)
|
||||||
self.train_locs = self.walk_corpus(self.train_path)
|
self.train_locs = self.walk_corpus(self.train_path)
|
||||||
|
|
|
@ -236,6 +236,12 @@ class Language(object):
|
||||||
doc.tensor = None
|
doc.tensor = None
|
||||||
|
|
||||||
def preprocess_gold(self, docs_golds):
|
def preprocess_gold(self, docs_golds):
|
||||||
|
"""Can be called before training to pre-process gold data. By default,
|
||||||
|
it handles nonprojectivity and adds missing tags to the tag map.
|
||||||
|
|
||||||
|
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
|
||||||
|
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
|
||||||
|
"""
|
||||||
for proc in self.pipeline:
|
for proc in self.pipeline:
|
||||||
if hasattr(proc, 'preprocess_gold'):
|
if hasattr(proc, 'preprocess_gold'):
|
||||||
docs_golds = proc.preprocess_gold(docs_golds)
|
docs_golds = proc.preprocess_gold(docs_golds)
|
||||||
|
|
|
@ -23,7 +23,8 @@
|
||||||
"Lexeme": "lexeme",
|
"Lexeme": "lexeme",
|
||||||
"Vocab": "vocab",
|
"Vocab": "vocab",
|
||||||
"StringStore": "stringstore",
|
"StringStore": "stringstore",
|
||||||
"GoldParse": "goldparse"
|
"GoldParse": "goldparse",
|
||||||
|
"GoldCorpus": "goldcorpus"
|
||||||
},
|
},
|
||||||
"Other": {
|
"Other": {
|
||||||
"Annotation Specs": "annotation",
|
"Annotation Specs": "annotation",
|
||||||
|
@ -135,6 +136,11 @@
|
||||||
"tag": "class"
|
"tag": "class"
|
||||||
},
|
},
|
||||||
|
|
||||||
|
"goldcorpus": {
|
||||||
|
"title": "GoldCorpus",
|
||||||
|
"tag": "class"
|
||||||
|
},
|
||||||
|
|
||||||
"annotation": {
|
"annotation": {
|
||||||
"title": "Annotation Specifications"
|
"title": "Annotation Specifications"
|
||||||
},
|
},
|
||||||
|
|
23
website/docs/api/goldcorpus.jade
Normal file
23
website/docs/api/goldcorpus.jade
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
//- 💫 DOCS > API > GOLDCORPUS
|
||||||
|
|
||||||
|
include ../../_includes/_mixins
|
||||||
|
|
||||||
|
p
|
||||||
|
| An annotated corpus, using the JSON file format. Manages annotations for
|
||||||
|
| tagging, dependency parsing and NER.
|
||||||
|
|
||||||
|
+h(2, "init") GoldCorpus.__init__
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Create a #[code GoldCorpus].
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code train_path]
|
||||||
|
+cell unicode or #[code Path]
|
||||||
|
+cell File or directory of training data.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code dev_path]
|
||||||
|
+cell unicode or #[code Path]
|
||||||
|
+cell File or directory of development data.
|
|
@ -7,7 +7,7 @@ p Collection for training annotations.
|
||||||
+h(2, "init") GoldParse.__init__
|
+h(2, "init") GoldParse.__init__
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Create a GoldParse.
|
p Create a #[code GoldParse].
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
|
|
@ -82,6 +82,41 @@ p
|
||||||
+cell #[code Doc]
|
+cell #[code Doc]
|
||||||
+cell A container for accessing the annotations.
|
+cell A container for accessing the annotations.
|
||||||
|
|
||||||
|
+h(2, "pipe") Language.pipe
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p
|
||||||
|
| Process texts as a stream, and yield #[code Doc] objects in order.
|
||||||
|
| Supports GIL-free multi-threading.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
texts = [u'One document.', u'...', u'Lots of documents']
|
||||||
|
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
||||||
|
assert doc.is_parsed
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code texts]
|
||||||
|
+cell -
|
||||||
|
+cell A sequence of unicode objects.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code n_threads]
|
||||||
|
+cell int
|
||||||
|
+cell
|
||||||
|
| The number of worker threads to use. If #[code -1], OpenMP will
|
||||||
|
| decide how many to use at run time. Default is #[code 2].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code batch_size]
|
||||||
|
+cell int
|
||||||
|
+cell The number of texts to buffer.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell yields
|
||||||
|
+cell #[code Doc]
|
||||||
|
+cell Documents in the order of the original texts.
|
||||||
|
|
||||||
+h(2, "update") Language.update
|
+h(2, "update") Language.update
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
|
@ -172,40 +207,23 @@ p
|
||||||
+cell -
|
+cell -
|
||||||
+cell Config parameters.
|
+cell Config parameters.
|
||||||
|
|
||||||
+h(2, "pipe") Language.pipe
|
+h(2, "preprocess_gold") Language.preprocess_gold
|
||||||
+tag method
|
|
||||||
|
|
||||||
p
|
p
|
||||||
| Process texts as a stream, and yield #[code Doc] objects in order.
|
| Can be called before training to pre-process gold data. By default, it
|
||||||
| Supports GIL-free multi-threading.
|
| handles nonprojectivity and adds missing tags to the tag map.
|
||||||
|
|
||||||
+aside-code("Example").
|
|
||||||
texts = [u'One document.', u'...', u'Lots of documents']
|
|
||||||
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
|
||||||
assert doc.is_parsed
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code texts]
|
+cell #[code docs_golds]
|
||||||
+cell -
|
+cell iterable
|
||||||
+cell A sequence of unicode objects.
|
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code n_threads]
|
|
||||||
+cell int
|
|
||||||
+cell
|
|
||||||
| The number of worker threads to use. If #[code -1], OpenMP will
|
|
||||||
| decide how many to use at run time. Default is #[code 2].
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code batch_size]
|
|
||||||
+cell int
|
|
||||||
+cell The number of texts to buffer.
|
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell yields
|
+cell yields
|
||||||
+cell #[code Doc]
|
+cell tuple
|
||||||
+cell Documents in the order of the original text.
|
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
|
||||||
|
|
||||||
+h(2, "to_disk") Language.to_disk
|
+h(2, "to_disk") Language.to_disk
|
||||||
+tag method
|
+tag method
|
||||||
|
|
Loading…
Reference in New Issue
Block a user