mirror of https://github.com/explosion/spaCy.git

More formatting changes

This commit is contained in:
  parent c22f075509
  commit a892821c51

fabfile.py (vendored) | 6 changed lines
@@ -1,6 +1,6 @@
 import contextlib
 from pathlib import Path
-from fabric.api import local, lcd, env, settings, prefix
+from fabric.api import local, lcd
 from os import path, environ
 import shutil
 import sys
@@ -79,9 +79,7 @@ def pex():
     with virtualenv(VENV_DIR) as venv_local:
         with lcd(path.dirname(__file__)):
             sha = local("git rev-parse --short HEAD", capture=True)
-            venv_local(
-                "pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True
-            )
+            venv_local(f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex", direct=True)
 
 
 def clean():
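Every hunk in this commit applies the same mechanical change: printf-style "%" interpolation is rewritten as an f-string literal (PEP 498, available since Python 3.6). A minimal sketch of the equivalence, using an illustrative SHA rather than a real git revision:

    # Hypothetical value; in the real code `sha` comes from `git rev-parse`.
    sha = "a892821"
    old_style = "pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha
    f_string = f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex"
    assert old_style == f_string  # both yield ...spacy-a892821.pex

Besides being shorter, the f-string version collapses the three-line call into one, which is what lets the hunk above shrink from 9 lines to 7.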
@@ -22,7 +22,7 @@ if __name__ == "__main__":
     if len(sys.argv) == 1:
         msg.info("Available commands", ", ".join(commands), exits=1)
     command = sys.argv.pop(1)
-    sys.argv[0] = "spacy %s" % command
+    sys.argv[0] = f"spacy {command}"
     if command in commands:
         plac.call(commands[command], sys.argv[1:])
     else:
@@ -296,7 +296,7 @@ def link_vectors_to_models(vocab):
             # This is a hack to avoid the problem in #3853. Maybe we should
             # print a warning as well?
             old_name = vectors.name
-            new_name = vectors.name + "_%d" % data.shape[0]
+            new_name = f"{vectors.name}_{data.shape[0]}"
             user_warning(Warnings.W019.format(old=old_name, new=new_name))
             vectors.name = new_name
             key = (ops.device, vectors.name)
@@ -47,20 +47,20 @@ def evaluate(
     end = timer()
     nwords = sum(len(ex.doc) for ex in dev_dataset)
     results = {
-        "Time": "%.2f s" % (end - begin),
+        "Time": f"{end - begin:.2f} s",
         "Words": nwords,
-        "Words/s": "%.0f" % (nwords / (end - begin)),
-        "TOK": "%.2f" % scorer.token_acc,
-        "POS": "%.2f" % scorer.tags_acc,
-        "UAS": "%.2f" % scorer.uas,
-        "LAS": "%.2f" % scorer.las,
-        "NER P": "%.2f" % scorer.ents_p,
-        "NER R": "%.2f" % scorer.ents_r,
-        "NER F": "%.2f" % scorer.ents_f,
-        "Textcat": "%.2f" % scorer.textcat_score,
-        "Sent P": "%.2f" % scorer.sent_p,
-        "Sent R": "%.2f" % scorer.sent_r,
-        "Sent F": "%.2f" % scorer.sent_f,
+        "Words/s": f"{nwords / (end - begin):.0f}",
+        "TOK": f"{scorer.token_acc:.2f}",
+        "POS": f"{scorer.tags_acc:.2f}",
+        "UAS": f"{scorer.uas:.2f}",
+        "LAS": f"{scorer.las:.2f}",
+        "NER P": f"{scorer.ents_p:.2f}",
+        "NER R": f"{scorer.ents_r:.2f}",
+        "NER F": f"{scorer.ents_f:.2f}",
+        "Textcat": f"{scorer.textcat_score:.2f}",
+        "Sent P": f"{scorer.sent_p:.2f}",
+        "Sent R": f"{scorer.sent_r:.2f}",
+        "Sent F": f"{scorer.sent_f:.2f}",
     }
     msg.table(results, title="Results")
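The conversions above also carry over printf precision specifiers: "%.2f" becomes the format spec ":.2f" after the expression, and "%.0f" becomes ":.0f". A small self-contained check (the values are made up; the real ones come from the Scorer and the timer):

    token_acc = 99.1234   # illustrative stand-in for scorer.token_acc
    elapsed = 12.3456     # illustrative stand-in for end - begin
    assert "%.2f" % token_acc == f"{token_acc:.2f}" == "99.12"
    assert "%.0f" % elapsed == f"{elapsed:.0f}" == "12"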
@@ -186,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
         if vectors_data is not None:
             nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if name is None:
-        nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
+        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
     else:
         nlp.vocab.vectors.name = name
     nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
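Note the quote flip inside the replacement field: before Python 3.12 (PEP 701), an f-string may not reuse its own quote character inside the braces, so nlp.meta["lang"] becomes nlp.meta['lang'] once it is embedded in a double-quoted f-string. A sketch with a hypothetical meta dict:

    meta = {"lang": "en"}  # stand-in for nlp.meta
    assert f"{meta['lang']}_model.vectors" == "en_model.vectors"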
@@ -232,7 +232,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
                     word = literal_eval(key)
                 except SyntaxError:
                     # Take odd strings literally.
-                    word = literal_eval("'%s'" % key)
+                    word = literal_eval(f"'{key}'")
                 smooth_count = counts.smoother(int(freq))
                 probs[word] = math.log(smooth_count) - log_total
     oov_prob = math.log(counts.smoother(0)) - log_total
@@ -83,7 +83,7 @@ def generate_meta(model_path, existing_meta, msg):
         ("lang", "Model language", meta.get("lang", "en")),
         ("name", "Model name", meta.get("name", "model")),
         ("version", "Model version", meta.get("version", "0.0.0")),
-        ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
+        ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
         ("description", "Model description", meta.get("description", False)),
         ("author", "Author", meta.get("author", False)),
         ("email", "Author email", meta.get("email", False)),
@@ -179,14 +179,12 @@ def pretrain(
         else:
             if not epoch_start:
                 msg.fail(
-                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
-                    "'--init-tok2vec'",
+                    "You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec",
                     exits=True,
                 )
             elif epoch_start < 0:
                 msg.fail(
-                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
-                    % epoch_start,
+                    f"The argument --epoch-start has to be greater or equal to 0. {epoch_start} is invalid",
                     exits=True,
                 )
     else:
@@ -195,16 +193,14 @@ def pretrain(
 
     optimizer = create_default_optimizer(model.ops)
     tracker = ProgressTracker(frequency=10000)
-    msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
+    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}")
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
 
     def _save_model(epoch, is_temp=False):
         is_temp_str = ".temp" if is_temp else ""
         with model.use_params(optimizer.averages):
-            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
-                "wb"
-            ) as file_:
+            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
                 file_.write(model.tok2vec.to_bytes())
             log = {
                 "nr_word": tracker.nr_word,
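Collapsing the parenthesized "%" expression into an f-string also composes cleanly with pathlib's "/" operator, which the surrounding code uses to build the output file path. A sketch under an assumed POSIX path (the real output_dir is a CLI argument):

    from pathlib import Path

    output_dir = Path("/tmp/out")  # illustrative path
    epoch, is_temp_str = 3, ".temp"
    target = output_dir / f"model{epoch}{is_temp_str}.bin"
    assert str(target) == "/tmp/out/model3.temp.bin"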
@@ -375,7 +375,7 @@ def train(
                     words_seen += sum(len(doc) for doc in docs)
             with nlp.use_params(optimizer.averages):
                 util.set_env_log(False)
-                epoch_model_path = output_path / ("model%d" % i)
+                epoch_model_path = output_path / f"model{i}"
                 nlp.to_disk(epoch_model_path)
                 nlp_loaded = util.load_model_from_path(epoch_model_path)
                 for beam_width in eval_beam_widths:
@@ -414,13 +414,13 @@ def train(
                             scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
                             end_time = timer()
                             cpu_wps = nwords / (end_time - start_time)
-                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
+                    acc_loc = output_path / f"model{i}" / "accuracy.json"
                     srsly.write_json(acc_loc, scorer.scores)
 
                     # Update model meta.json
                     meta["lang"] = nlp.lang
                     meta["pipeline"] = nlp.pipe_names
-                    meta["spacy_version"] = ">=%s" % about.__version__
+                    meta["spacy_version"] = f">={about.__version__}"
                     if beam_width == 1:
                         meta["speed"] = {
                             "nwords": nwords,
@@ -443,10 +443,10 @@ def train(
                         "keys": nlp.vocab.vectors.n_keys,
                         "name": nlp.vocab.vectors.name,
                     }
-                    meta.setdefault("name", "model%d" % i)
+                    meta.setdefault("name", f"model{i}")
                     meta.setdefault("version", version)
                     meta["labels"] = nlp.meta["labels"]
-                    meta_loc = output_path / ("model%d" % i) / "meta.json"
+                    meta_loc = output_path / f"model{i}" / "meta.json"
                     srsly.write_json(meta_loc, meta)
                     util.set_env_log(verbose)
 
@@ -615,7 +615,7 @@ def _consume_ent(tags):
     else:
         start = "B-" + label
         end = "L-" + label
-        middle = ["I-%s" % label for _ in range(1, length - 1)]
+        middle = [f"I-{label}" for _ in range(1, length - 1)]
         return [start] + middle + [end]
 
 
@@ -1204,12 +1204,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
         # Only interested if the tokenization is correct
         if start_token is not None and end_token is not None:
             if start_token == end_token:
-                biluo[start_token] = "U-%s" % label
+                biluo[start_token] = f"U-{label}"
             else:
-                biluo[start_token] = "B-%s" % label
+                biluo[start_token] = f"B-{label}"
                 for i in range(start_token+1, end_token):
-                    biluo[i] = "I-%s" % label
-                biluo[end_token] = "L-%s" % label
+                    biluo[i] = f"I-{label}"
+                biluo[end_token] = f"L-{label}"
     # Now distinguish the O cases from ones where we miss the tokenization
     entity_chars = set()
     for start_char, end_char, label in entities:
@@ -442,7 +442,7 @@ cdef class KnowledgeBase:
 cdef class Writer:
     def __init__(self, object loc):
         if path.exists(loc):
-            assert not path.isdir(loc), "%s is directory." % loc
+            assert not path.isdir(loc), f"{loc} is directory"
         if isinstance(loc, Path):
             loc = bytes(loc)
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
@@ -30,9 +30,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
 
 for h in range(1, 12 + 1):
     for period in ["a.m.", "am"]:
-        _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
     for period in ["p.m.", "pm"]:
-        _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
 
 
 TOKENIZER_EXCEPTIONS = _exc
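In these tokenizer-exception tables, f"{h}" is simply the f-string spelling of str(h); the loop still generates keys like "1a.m." through "12pm". A self-contained sketch with a stand-in ORTH key (the real ORTH is a spaCy attribute symbol, not a string):

    ORTH = "orth"  # hypothetical stand-in
    _exc = {}
    for h in range(1, 12 + 1):
        for period in ["a.m.", "am"]:
            _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period}]
    assert "12am" in _exc and _exc["1a.m."][0][ORTH] == "1"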
@@ -559,7 +559,7 @@ for exc_data in [
 # Dates
 for h in range(1, 31 + 1):
     for period in ["."]:
-        _exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
 
 _custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]}
 _exc.update(_custom_base_exc)
@@ -131,14 +131,14 @@ _exc.update(_other_exc)
 for h in range(1, 12 + 1):
 
     for period in ["π.μ.", "πμ"]:
-        _exc["%d%s" % (h, period)] = [
-            {ORTH: "%d" % h},
+        _exc[f"{h}{period}"] = [
+            {ORTH: f"{h}"},
             {ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."},
         ]
 
     for period in ["μ.μ.", "μμ"]:
-        _exc["%d%s" % (h, period)] = [
-            {ORTH: "%d" % h},
+        _exc[f"{h}{period}"] = [
+            {ORTH: f"{h}"},
             {ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."},
         ]
 
@@ -328,13 +328,13 @@ for exc_data in [
 
 for h in range(1, 12 + 1):
     for period in ["a.m.", "am"]:
-        _exc["%d%s" % (h, period)] = [
-            {ORTH: "%d" % h},
+        _exc[f"{h}{period}"] = [
+            {ORTH: f"{h}"},
             {ORTH: period, LEMMA: "a.m.", NORM: "a.m."},
         ]
     for period in ["p.m.", "pm"]:
-        _exc["%d%s" % (h, period)] = [
-            {ORTH: "%d" % h},
+        _exc[f"{h}{period}"] = [
+            {ORTH: f"{h}"},
             {ORTH: period, LEMMA: "p.m.", NORM: "p.m."},
         ]
 
@@ -28,9 +28,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
 
 for h in range(1, 12 + 1):
     for period in ["a.m.", "am"]:
-        _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
     for period in ["p.m.", "pm"]:
-        _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
 
 
 for orth in [
@@ -85,7 +85,7 @@ for verb, verb_lemma in [("est", "être")]:
 
 for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
     for orth in [pre, pre.title()]:
-        _exc["%sest-ce" % orth] = [
+        _exc[f"{orth}est-ce"] = [
             {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
             {LEMMA: "être", ORTH: "est", TAG: "VERB"},
             {LEMMA: "ce", ORTH: "-ce"},
@@ -1015,7 +1015,7 @@ def _fix_pretrained_vectors_name(nlp):
     elif not nlp.vocab.vectors.size:
         nlp.vocab.vectors.name = None
     elif "name" in nlp.meta and "lang" in nlp.meta:
-        vectors_name = "%s_%s.vectors" % (nlp.meta["lang"], nlp.meta["name"])
+        vectors_name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
         nlp.vocab.vectors.name = vectors_name
     else:
         raise ValueError(Errors.E092)
@@ -72,7 +72,7 @@ def _normalize_props(props):
             # just take the first one :(
             if "|" in value:
                 value = value.split("|")[0]
-            attr = '%s_%s' % (key, value)
+            attr = f"{key}_{value}"
             if attr in FEATURES:
                 props.pop(key)
                 props[attr] = True
@@ -985,14 +985,14 @@ class MultitaskObjective(Tagger):
         offset = token_annotation.heads[i] - i
         offset = min(offset, 2)
         offset = max(offset, -2)
-        return "%s-%s:%d" % (token_annotation.deps[i], token_annotation.tags[i], offset)
+        return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}"
 
     @staticmethod
     def make_ent_tag(i, token_annotation):
         if token_annotation.entities is None or token_annotation.entities[i] is None:
             return None
         else:
-            return "%s-%s" % (token_annotation.tags[i], token_annotation.entities[i])
+            return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}"
 
     @staticmethod
     def make_sent_start(target, token_annotation, cache=True, _cache={}):
@@ -154,8 +154,7 @@ def _decorate(heads, proj_heads, labels):
     deco_labels = []
     for tokenid, head in enumerate(heads):
         if head != proj_heads[tokenid]:
-            deco_labels.append(
-                '%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
+            deco_labels.append(f"{labels[tokenid]}{DELIMITER}{labels[head]}")
         else:
             deco_labels.append(labels[tokenid])
     return deco_labels
@@ -46,9 +46,9 @@ cdef class StateClass:
 
     def print_state(self, words):
         words = list(words) + ['_']
-        top = words[self.S(0)] + '_%d' % self.S_(0).head
-        second = words[self.S(1)] + '_%d' % self.S_(1).head
-        third = words[self.S(2)] + '_%d' % self.S_(2).head
+        top = f"{words[self.S(0)]}_{self.S_(0).head}"
+        second = f"{words[self.S(1)]}_{self.S_(1).head}"
+        third = f"{words[self.S(2)]}_{self.S_(2).head}"
         n0 = words[self.B(0)]
         n1 = words[self.B(1)]
         return ' '.join((third, second, top, '|', n0, n1))
@@ -14,11 +14,11 @@ def pytest_runtest_setup(item):
         # recognize the option we're asking about. To avoid this, we need to
         # pass a default value. We default to False, i.e., we act like all the
         # options weren't given.
-        return item.config.getoption("--%s" % opt, False)
+        return item.config.getoption(f"--{opt}", False)
 
     for opt in ["slow"]:
         if opt in item.keywords and not getopt(opt):
-            pytest.skip("need --%s option to run" % opt)
+            pytest.skip(f"need --{opt} option to run")
 
 
 # Fixtures for language tokenizers (languages sorted alphabetically)
@@ -115,7 +115,7 @@ def test_disable_pipes_list_arg(nlp):
 @pytest.mark.parametrize("n_pipes", [100])
 def test_add_lots_of_pipes(nlp, n_pipes):
     for i in range(n_pipes):
-        nlp.add_pipe(lambda doc: doc, name="pipe_%d" % i)
+        nlp.add_pipe(lambda doc: doc, name=f"pipe_{i}")
     assert len(nlp.pipe_names) == n_pipes
 
 
@@ -499,7 +499,7 @@ cdef class Doc:
                 token = &self.c[i]
                 if token.ent_iob == 1:
                     if start == -1:
-                        seq = ["%s|%s" % (t.text, t.ent_iob_) for t in self[i-5:i+5]]
+                        seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]]
                         raise ValueError(Errors.E093.format(seq=" ".join(seq)))
                 elif token.ent_iob == 2 or token.ent_iob == 0:
                     if start != -1:
@@ -62,7 +62,7 @@ def get_lang_class(lang):
         return registry.languages.get(lang)
     else:
         try:
-            module = importlib.import_module(".lang.%s" % lang, "spacy")
+            module = importlib.import_module(f".lang.{lang}", "spacy")
         except ImportError as err:
             raise ImportError(Errors.E048.format(lang=lang, err=err))
         set_lang_class(lang, getattr(module, module.__all__[0]))
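The f-string here builds a relative module name: the leading dot makes importlib.import_module resolve it against the package argument, so ".lang.en" anchored at "spacy" loads spacy.lang.en. A sketch (assumes spaCy is installed; "en" is just an example language code):

    import importlib

    lang = "en"
    module = importlib.import_module(f".lang.{lang}", "spacy")
    print(module.__name__)  # spacy.lang.en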
@@ -212,7 +212,7 @@ def load_model_from_init_py(init_file, **overrides):
     """
     model_path = Path(init_file).parent
     meta = get_model_meta(model_path)
-    data_dir = "%s_%s-%s" % (meta["lang"], meta["name"], meta["version"])
+    data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}"
     data_path = model_path / data_dir
     if not model_path.exists():
         raise IOError(Errors.E052.format(path=data_path))
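Where the old code interpolated a tuple of values, the f-string version inlines one expression per replacement field, again switching to single quotes for the nested dict keys. A check against a hypothetical model meta:

    meta = {"lang": "en", "name": "core_web_sm", "version": "2.2.0"}
    old_style = "%s_%s-%s" % (meta["lang"], meta["name"], meta["version"])
    f_string = f"{meta['lang']}_{meta['name']}-{meta['version']}"
    assert old_style == f_string == "en_core_web_sm-2.2.0"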