From bf92625edead3707038fbd00e0b249ebe1f04855 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 26 Apr 2019 13:19:50 +0200 Subject: [PATCH] Update from master --- .github/contributors/bjascob.md | 106 ++++++++++++++++++++++++++++ spacy/cli/evaluate.py | 2 +- spacy/cli/pretrain.py | 43 +++++++----- spacy/cli/train.py | 45 +++++++----- spacy/displacy/__init__.py | 7 +- spacy/lang/th/__init__.py | 10 ++- spacy/lang/th/norm_exceptions.py | 114 +++++++++++++++++++++++++++++++ website/meta/universe.json | 22 ++++++ 8 files changed, 310 insertions(+), 39 deletions(-) create mode 100644 .github/contributors/bjascob.md create mode 100644 spacy/lang/th/norm_exceptions.py diff --git a/.github/contributors/bjascob.md b/.github/contributors/bjascob.md new file mode 100644 index 000000000..4870c494a --- /dev/null +++ b/.github/contributors/bjascob.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Brad Jascob | +| Company name (if applicable) | n/a | +| Title or role (if applicable) | Software Engineer | +| Date | 04/25/2019 | +| GitHub username | bjascob | +| Website (optional) | n/a | diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index df391d730..468698e2f 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -17,7 +17,7 @@ from .. import displacy gpu_id=("Use GPU", "option", "g", int), displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str), displacy_limit=("Limit of parses to render as HTML", "option", "dl", int), - return_scores=("Return dict containing model scores", "flag", "r", bool), + return_scores=("Return dict containing model scores", "flag", "R", bool), ) def evaluate( model, diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 0b316b47c..ef91937a6 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -34,7 +34,8 @@ from .. import util max_length=("Max words per example.", "option", "xw", int), min_length=("Min words per example.", "option", "nw", int), seed=("Seed for random number generators", "option", "s", float), - nr_iter=("Number of iterations to pretrain", "option", "i", int), + n_iter=("Number of iterations to pretrain", "option", "i", int), + n_save_every=("Save model every X batches.", "option", "se", int), ) def pretrain( texts_loc, @@ -46,11 +47,12 @@ def pretrain( loss_func="cosine", use_vectors=False, dropout=0.2, - nr_iter=1000, + n_iter=1000, batch_size=3000, max_length=500, min_length=5, seed=0, + n_save_every=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, @@ -115,9 +117,26 @@ def pretrain( msg.divider("Pre-training tok2vec layer") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) - for epoch in range(nr_iter): - for batch in util.minibatch_by_words( - ((text, None) for text in texts), size=batch_size + + def _save_model(epoch, is_temp=False): + is_temp_str = ".temp" if is_temp else "" + with model.use_params(optimizer.averages): + with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open( + "wb" + ) as file_: + file_.write(model.tok2vec.to_bytes()) + log = { + "nr_word": tracker.nr_word, + "loss": tracker.loss, + "epoch_loss": tracker.epoch_loss, + "epoch": epoch, + } + with (output_dir / "log.jsonl").open("a") as file_: + file_.write(srsly.json_dumps(log) + "\n") + + for epoch in range(n_iter): + for batch_id, batch in enumerate( + util.minibatch_by_words(((text, None) for text in texts), size=batch_size) ): docs = make_docs( nlp, @@ -133,17 +152,9 @@ def pretrain( msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: break - with model.use_params(optimizer.averages): - with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_: - file_.write(model.tok2vec.to_bytes()) - log = { - "nr_word": tracker.nr_word, - "loss": tracker.loss, - "epoch_loss": tracker.epoch_loss, - "epoch": epoch, - } - with (output_dir / "log.jsonl").open("a") as file_: - file_.write(srsly.json_dumps(log) + "\n") + if n_save_every and (batch_id % n_save_every == 0): + _save_model(epoch, is_temp=True) + _save_model(epoch) tracker.epoch_loss = 0.0 if texts_loc != "-": # Reshuffle the texts if texts were loaded from a file diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 5cf0f5f6f..63c6242de 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -35,7 +35,12 @@ from .. import about pipeline=("Comma-separated names of pipeline components", "option", "p", str), vectors=("Model to load vectors from", "option", "v", str), n_iter=("Number of iterations", "option", "n", int), - early_stopping_iter=("Maximum number of training epochs without dev accuracy improvement", "option", "e", int), + n_early_stopping=( + "Maximum number of training epochs without dev accuracy improvement", + "option", + "ne", + int, + ), n_examples=("Number of examples", "option", "ns", int), use_gpu=("Use GPU", "option", "g", int), version=("Model version", "option", "V", str), @@ -75,7 +80,7 @@ def train( pipeline="tagger,parser,ner", vectors=None, n_iter=30, - early_stopping_iter=None, + n_early_stopping=None, n_examples=0, use_gpu=-1, version="0.0.0", @@ -226,7 +231,7 @@ def train( msg.row(["-" * width for width in row_settings["widths"]], **row_settings) try: iter_since_best = 0 - best_score = 0. + best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs( nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0 @@ -335,17 +340,23 @@ def train( gpu_wps=gpu_wps, ) msg.row(progress, **row_settings) - # early stopping - if early_stopping_iter is not None: + # Early stopping + if n_early_stopping is not None: current_score = _score_for_model(meta) if current_score < best_score: iter_since_best += 1 else: iter_since_best = 0 best_score = current_score - if iter_since_best >= early_stopping_iter: - msg.text("Early stopping, best iteration is: {}".format(i-iter_since_best)) - msg.text("Best score = {}; Final iteration score = {}".format(best_score, current_score)) + if iter_since_best >= n_early_stopping: + msg.text( + "Early stopping, best iteration " + "is: {}".format(i - iter_since_best) + ) + msg.text( + "Best score = {}; Final iteration " + "score = {}".format(best_score, current_score) + ) break finally: with nlp.use_params(optimizer.averages): @@ -356,19 +367,21 @@ def train( best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names) msg.good("Created best model", best_model_path) + def _score_for_model(meta): """ Returns mean score between tasks in pipeline that can be used for early stopping. """ mean_acc = list() - pipes = meta['pipeline'] - acc = meta['accuracy'] - if 'tagger' in pipes: - mean_acc.append(acc['tags_acc']) - if 'parser' in pipes: - mean_acc.append((acc['uas']+acc['las']) / 2) - if 'ner' in pipes: - mean_acc.append((acc['ents_p']+acc['ents_r']+acc['ents_f']) / 3) + pipes = meta["pipeline"] + acc = meta["accuracy"] + if "tagger" in pipes: + mean_acc.append(acc["tags_acc"]) + if "parser" in pipes: + mean_acc.append((acc["uas"] + acc["las"]) / 2) + if "ner" in pipes: + mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3) return sum(mean_acc) / len(mean_acc) + @contextlib.contextmanager def _create_progress_bar(total): if int(os.environ.get("LOG_FRIENDLY", 0)): diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index fadbaaa7e..b651c0996 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -19,7 +19,7 @@ RENDER_WRAPPER = None def render( - docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False + docs, style="dep", page=False, minify=False, jupyter=None, options={}, manual=False ): """Render displaCy visualisation. @@ -27,7 +27,7 @@ def render( style (unicode): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. - jupyter (bool): Experimental, use Jupyter's `display()` to output markup. + jupyter (bool): Override Jupyter auto-detection. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. RETURNS (unicode): Rendered HTML markup. @@ -53,7 +53,8 @@ def render( html = _html["parsed"] if RENDER_WRAPPER is not None: html = RENDER_WRAPPER(html) - if jupyter or is_in_jupyter(): # return HTML rendered by IPython display() + if jupyter or (jupyter is None and is_in_jupyter()): + # return HTML rendered by IPython display() from IPython.core.display import display, HTML return display(HTML(html)) diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 0bd8333db..ba5b86d77 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -4,11 +4,13 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS +from .norm_exceptions import NORM_EXCEPTIONS -from ...attrs import LANG +from ..norm_exceptions import BASE_NORMS +from ...attrs import LANG, NORM from ...language import Language from ...tokens import Doc -from ...util import DummyTokenizer +from ...util import DummyTokenizer, add_lookups class ThaiTokenizer(DummyTokenizer): @@ -33,7 +35,9 @@ class ThaiTokenizer(DummyTokenizer): class ThaiDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda _text: "th" - + lex_attr_getters[NORM] = add_lookups( + Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS + ) tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py new file mode 100644 index 000000000..497779cf9 --- /dev/null +++ b/spacy/lang/th/norm_exceptions.py @@ -0,0 +1,114 @@ +# coding: utf8 +from __future__ import unicode_literals + + +_exc = { + # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์) + "สนุ๊กเกอร์": "สนุกเกอร์", + "โน้ต": "โน้ต", + # Misspelled because of being lazy or hustle (สะกดผิดเพราะขี้เกียจพิมพ์ หรือเร่งรีบ) + "โทสับ": "โทรศัพท์", + "พุ่งนี้": "พรุ่งนี้", + # Strange (ให้ดูแปลกตา) + "ชะมะ": "ใช่ไหม", + "ชิมิ": "ใช่ไหม", + "ชะ": "ใช่ไหม", + "ช่ายมะ": "ใช่ไหม", + "ป่าว": "เปล่า", + "ป่ะ": "เปล่า", + "ปล่าว": "เปล่า", + "คัย": "ใคร", + "ไค": "ใคร", + "คราย": "ใคร", + "เตง": "ตัวเอง", + "ตะเอง": "ตัวเอง", + "รึ": "หรือ", + "เหรอ": "หรือ", + "หรา": "หรือ", + "หรอ": "หรือ", + "ชั้น": "ฉัน", + "ชั้ล": "ฉัน", + "ช้าน": "ฉัน", + "เทอ": "เธอ", + "เทอร์": "เธอ", + "เทอว์": "เธอ", + "แกร": "แก", + "ป๋ม": "ผม", + "บ่องตง": "บอกตรงๆ", + "ถ่ามตง": "ถามตรงๆ", + "ต่อมตง": "ตอบตรงๆ", + "เพิ่ล": "เพื่อน", + "จอบอ": "จอบอ", + "ดั้ย": "ได้", + "ขอบคุง": "ขอบคุณ", + "ยังงัย": "ยังไง", + "Inw": "เทพ", + "uou": "นอน", + "Lกรีeu": "เกรียน", + # Misspelled to express emotions (คำที่สะกดผิดเพื่อแสดงอารมณ์) + "เปงราย": "เป็นอะไร", + "เปนรัย": "เป็นอะไร", + "เปงรัย": "เป็นอะไร", + "เป็นอัลไล": "เป็นอะไร", + "ทามมาย": "ทำไม", + "ทามมัย": "ทำไม", + "จังรุย": "จังเลย", + "จังเยย": "จังเลย", + "จุงเบย": "จังเลย", + "ไม่รู้": "มะรุ", + "เฮ่ย": "เฮ้ย", + "เห้ย": "เฮ้ย", + "น่าร็อค": "น่ารัก", + "น่าร๊าก": "น่ารัก", + "ตั้ลล๊าก": "น่ารัก", + "คือร๊ะ": "คืออะไร", + "โอป่ะ": "โอเคหรือเปล่า", + "น่ามคาน": "น่ารำคาญ", + "น่ามสาร": "น่าสงสาร", + "วงวาร": "สงสาร", + "บับว่า": "แบบว่า", + "อัลไล": "อะไร", + "อิจ": "อิจฉา", + # Reduce rough words or Avoid to software filter (คำที่สะกดผิดเพื่อลดความหยาบของคำ หรืออาจใช้หลีกเลี่ยงการกรองคำหยาบของซอฟต์แวร์) + "กรู": "กู", + "กุ": "กู", + "กรุ": "กู", + "ตู": "กู", + "ตรู": "กู", + "มรึง": "มึง", + "เมิง": "มึง", + "มืง": "มึง", + "มุง": "มึง", + "สาด": "สัตว์", + "สัส": "สัตว์", + "สัก": "สัตว์", + "แสรด": "สัตว์", + "โคโตะ": "โคตร", + "โคด": "โคตร", + "โครต": "โคตร", + "โคตะระ": "โคตร", + "พ่อง": "พ่อมึง", + "แม่เมิง": "แม่มึง", + "เชี่ย": "เหี้ย", + # Imitate words (คำเลียนเสียง โดยส่วนใหญ่จะเพิ่มทัณฑฆาต หรือซ้ำตัวอักษร) + "แอร๊ยย": "อ๊าย", + "อร๊ายยย": "อ๊าย", + "มันส์": "มัน", + "วู๊วววววววว์": "วู้", + # Acronym (แบบคำย่อ) + "หมาลัย": "มหาวิทยาลัย", + "วิดวะ": "วิศวะ", + "สินสาด ": "ศิลปศาสตร์", + "สินกำ ": "ศิลปกรรมศาสตร์", + "เสารีย์ ": "อนุเสาวรีย์ชัยสมรภูมิ", + "เมกา ": "อเมริกา", + "มอไซค์ ": "มอเตอร์ไซค์", +} + + +NORM_EXCEPTIONS = {} + +for string, norm in _exc.items(): + NORM_EXCEPTIONS[string] = norm + NORM_EXCEPTIONS[string.title()] = norm + diff --git a/website/meta/universe.json b/website/meta/universe.json index 29e050964..a6a8bf247 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1316,6 +1316,28 @@ "author_links": { "github": "oterrier" } + }, + { + "id": "pyInflect", + "slogan": "A python module for word inflections", + "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add word inflections to the system.", + "github": "bjascob/pyInflect", + "pip": "pyinflect", + "code_example": [ + "import spacy", + "import pyinflect", + "", + "nlp = spacy.load('en_core_web_sm')", + "doc = nlp('This is an example.')", + "doc[3].tag_ # NN", + "doc[3]._.inflect('NNS') # examples" + ], + "author": "Brad Jascob", + "author_links": { + "github": "bjascob" + }, + "category": ["pipeline"], + "tags": ["inflection"] } ], "categories": [