Mirror of https://github.com/explosion/spaCy.git (synced 2025-04-25 11:23:40 +03:00)
Commit 452902c74a: Merge branch 'develop' into nightly.spacy.io

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a34,<8.0.0a40",
+    "thinc>=8.0.0a35,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a34,<8.0.0a40
+thinc>=8.0.0a35,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a25"
+__version__ = "3.0.0a26"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -209,6 +209,8 @@ def walk_directory(path: Path, converter: str) -> List[Path]:
             continue
         else:
             locs.append(path)
+    # It's good to sort these, in case the ordering messes up cache.
+    locs.sort()
     return locs

@@ -121,20 +121,19 @@ def train(
 
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     if weights_data is not None:
-        tok2vec_path = config["pretraining"].get("tok2vec_model", None)
-        if tok2vec_path is None:
+        tok2vec_component = config["pretraining"]["component"]
+        if tok2vec_component is None:
             msg.fail(
-                f"To pretrained tok2vec weights, the config needs to specify which "
-                f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them.",
                 exits=1,
             )
-        tok2vec = config
-        for subpath in tok2vec_path.split("."):
-            tok2vec = tok2vec.get(subpath)
-        if not tok2vec:
-            err = f"Could not locate the tok2vec model at {tok2vec_path}"
-            msg.fail(err, exits=1)
-        tok2vec.from_bytes(weights_data)
+        layer = nlp.get_pipe(tok2vec_component).model
+        tok2vec_layer = config["pretraining"]["layer"]
+        if tok2vec_layer:
+            layer = layer.get_ref(tok2vec_layer)
+        layer.from_bytes(weights_data)
+        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
 
     # Create iterator, which yields out info after each optimization step.
     msg.info("Start training")

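The updated block resolves the target pipe from [pretraining.component], optionally drills into the sub-layer named by [pretraining.layer] via Thinc's get_ref, and restores the weights with from_bytes. A minimal sketch of that resolution as a standalone helper (the function name and signature are illustrative, not part of the commit):

from spacy.language import Language


def load_pretrained_weights(nlp: Language, weights_data: bytes, component: str, layer_name: str = ""):
    # Illustrative helper, not part of the commit: mirrors how the train loop
    # applies [pretraining.component] and [pretraining.layer].
    layer = nlp.get_pipe(component).model
    if layer_name:  # an empty string means "load into the component's whole model"
        layer = layer.get_ref(layer_name)
    layer.from_bytes(weights_data)
    return layer
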
@@ -85,7 +85,7 @@ class Warnings:
             "attribute or operator.")
 
     # TODO: fix numbering after merging develop into master
-    W090 = ("Could not locate any binary .spacy files in path '{path}'.")
+    W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
     W093 = ("Could not find any data to train the {name} on. Is your "

@@ -79,23 +79,32 @@ class AttributeRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/attributeruler#call
         """
-        matches = sorted(self.matcher(doc, allow_missing=True))
-        for match_id, start, end in matches:
+        matches = self.matcher(doc, allow_missing=True)
+        # Sort by the attribute ID, so that later rules have precendence
+        matches = [
+            (_parse_key(self.vocab.strings[m_id]), m_id, s, e)
+            for m_id, s, e in matches
+        ]
+        matches.sort()
+        for attr_id, match_id, start, end in matches:
             span = Span(doc, start, end, label=match_id)
-            attrs = self.attrs[span.label]
-            index = self.indices[span.label]
+            attrs = self.attrs[attr_id]
+            index = self.indices[attr_id]
             try:
+                # The index can be negative, which makes it annoying to do
+                # the boundscheck. Let Span do it instead.
                 token = span[index]
             except IndexError:
+                # The original exception is just our conditional logic, so we
+                # raise from.
                 raise ValueError(
                     Errors.E1001.format(
                         patterns=self.matcher.get(span.label),
                         span=[t.text for t in span],
                         index=index,
                     )
                 ) from None
-            set_token_attrs(token, attrs)
+            set_token_attrs(span[index], attrs)
         return doc
 
     def pipe(self, stream, *, batch_size=128):

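The rewritten __call__ no longer sorts the raw matcher output; it decorates each match with the rule ordinal parsed from its string key and sorts on that, so rules added later are applied later and take precedence on overlapping tokens. A small standalone illustration with made-up match data:

def _parse_key(key):
    return int(key.rsplit("_", 1)[1])

# Hypothetical StringStore contents and matcher output (match_id, start, end).
strings = {101: "attr_rule_2", 202: "attr_rule_0"}
raw_matches = [(101, 0, 2), (202, 0, 1)]
matches = sorted((_parse_key(strings[m]), m, s, e) for m, s, e in raw_matches)
# -> [(0, 202, 0, 1), (2, 101, 0, 2)]: rule 0 is applied first, rule 2 last.
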
@@ -173,7 +182,10 @@ class AttributeRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/attributeruler#add
         """
-        self.matcher.add(len(self.attrs), patterns)
+        # We need to make a string here, because otherwise the ID we pass back
+        # will be interpreted as the hash of a string, rather than an ordinal.
+        key = _make_key(len(self.attrs))
+        self.matcher.add(self.vocab.strings.add(key), patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
         self.attrs.append(attrs)

@@ -197,7 +209,7 @@ class AttributeRuler(Pipe):
         all_patterns = []
         for i in range(len(self.attrs)):
             p = {}
-            p["patterns"] = self.matcher.get(i)[1]
+            p["patterns"] = self.matcher.get(_make_key(i))[1]
             p["attrs"] = self._attrs_unnormed[i]
             p["index"] = self.indices[i]
             all_patterns.append(p)

@@ -301,6 +313,12 @@ class AttributeRuler(Pipe):
 
         return self
 
 
+def _make_key(n_attr):
+    return f"attr_rule_{n_attr}"
+
+
+def _parse_key(key):
+    return int(key.rsplit("_", 1)[1])
+
+
 def _split_morph_attrs(attrs):
     """Split entries from a tag map or morph rules dict into to two dicts, one

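These helpers exist because Matcher keys are treated as pre-hashed strings: passing the bare ordinal len(self.attrs) would be read as a hash rather than registered in the StringStore, and the ID the matcher hands back could not be mapped back to a rule index. A sketch of the round-trip, assuming only a plain StringStore:

from spacy.strings import StringStore


def _make_key(n_attr):
    return f"attr_rule_{n_attr}"


def _parse_key(key):
    return int(key.rsplit("_", 1)[1])


strings = StringStore()
key_hash = strings.add(_make_key(12))       # register "attr_rule_12", get its hash
assert _parse_key(strings[key_hash]) == 12  # hash -> text -> ordinal round-trips
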
@@ -60,49 +60,6 @@ cdef struct MorphAnalysisC:
     hash_t key
     int length
 
-    attr_t abbr
-    attr_t adp_type
-    attr_t adv_type
-    attr_t animacy
-    attr_t aspect
-    attr_t case
-    attr_t conj_type
-    attr_t connegative
-    attr_t definite
-    attr_t degree
-    attr_t derivation
-    attr_t echo
-    attr_t foreign
-    attr_t gender
-    attr_t hyph
-    attr_t inf_form
-    attr_t mood
-    attr_t negative
-    attr_t number
-    attr_t name_type
-    attr_t noun_type
-    attr_t num_form
-    attr_t num_type
-    attr_t num_value
-    attr_t part_form
-    attr_t part_type
-    attr_t person
-    attr_t polite
-    attr_t polarity
-    attr_t poss
-    attr_t prefix
-    attr_t prep_case
-    attr_t pron_type
-    attr_t punct_side
-    attr_t punct_type
-    attr_t reflex
-    attr_t style
-    attr_t style_variant
-    attr_t tense
-    attr_t typo
-    attr_t verb_form
-    attr_t voice
-    attr_t verb_type
     attr_t* fields
     attr_t* features
 

@@ -171,7 +171,7 @@ class DocBin:
             "tokens": tokens.tobytes("C"),
             "spaces": spaces.tobytes("C"),
             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
-            "strings": list(self.strings),
+            "strings": list(sorted(self.strings)),
            "cats": self.cats,
            "flags": self.flags,
        }

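Sorting the string table means the serialized payload no longer depends on the order in which strings were first seen, so the same collection of docs produces identical output across runs. A trivial illustration of the property being relied on (example strings are made up):

# Two runs that encounter the same strings in different orders still agree
# once the table is sorted.
seen_in_run_a = ["PERSON", "ORG", "GPE"]
seen_in_run_b = ["GPE", "PERSON", "ORG"]
assert sorted(seen_in_run_a) == sorted(seen_in_run_b) == ["GPE", "ORG", "PERSON"]
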
@@ -49,7 +49,9 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
         elif path.parts[-1].endswith(file_type):
             locs.append(path)
     if len(locs) == 0:
-        warnings.warn(Warnings.W090.format(path=orig_path))
+        warnings.warn(Warnings.W090.format(path=orig_path, format=file_type))
+    # It's good to sort these, in case the ordering messes up a cache.
+    locs.sort()
     return locs
 
 

@@ -200,7 +202,7 @@ class JsonlTexts:
 
         DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
         """
-        for loc in walk_corpus(self.path, "jsonl"):
+        for loc in walk_corpus(self.path, ".jsonl"):
             records = srsly.read_jsonl(loc)
             for record in records:
                 doc = nlp.make_doc(record["text"])

@@ -62,7 +62,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
     """Read Example dictionaries from a json file or directory."""
     loc = util.ensure_path(loc)
     if loc.is_dir():
-        for filename in loc.iterdir():
+        for filename in sorted(loc.iterdir()):
             yield from read_json_file(loc / filename, limit=limit)
     else:
         with loc.open("rb") as file_:

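Path.iterdir() yields entries in filesystem-dependent order, so sorting the directory listing keeps the order of training examples reproducible across machines. A minimal sketch of the same pattern (the helper name and .json filter are illustrative):

from pathlib import Path


def iter_json_files(loc: Path):
    # Sort so the corpus is read in the same order on every filesystem.
    for filename in sorted(loc.iterdir()):
        if filename.suffix == ".json":
            yield filename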