Merge branch 'master' into develop

This commit is contained in:
ines 2017-03-22 17:14:57 +01:00
commit ccea10b4a9
12 changed files with 7 additions and 40 deletions

View File

@ -1,18 +1,13 @@
from __future__ import unicode_literals
import plac
import json
from os import path
import shutil
import os
import random
import io
import pathlib
from spacy.tokens import Doc
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.language import Language
from spacy.gold import GoldParse
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.pipeline import DependencyParser, BeamDependencyParser
from spacy.syntax.parser import get_templates
@ -23,7 +18,6 @@ import spacy.attrs
import io
def read_conllx(loc, n=0):
with io.open(loc, 'r', encoding='utf8') as file_:
text = file_.read()
@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
lines.pop(0)
tokens = []
for line in lines:
id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
id_, word, lemma, pos, tag, morph, head, dep, _1, \
_2 = line.split('\t')
if '-' in id_ or '.' in id_:
continue
try:
@ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
random.shuffle(train_sents)
scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
nlp.end_training(model_dir)
scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

View File

@ -1,4 +1,4 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.stdio cimport fopen, fclose, fread, fwrite
from libc.string cimport memcpy

View File

@ -21,7 +21,6 @@ MORPH_RULES = {
"them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
"mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
"yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
"his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
"hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
"its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},

View File

@ -193,9 +193,6 @@ TOKENIZER_EXCEPTIONS = {
"vm.": [
{ORTH: "vm.", LEMMA: "viimeksi mainittu"}
],
"siht.": [
{ORTH: "siht.", LEMMA: "sihteeri"}
],
"srk.": [
{ORTH: "srk.", LEMMA: "seurakunta"}
]

View File

@ -1,16 +1,12 @@
# cython: profile=True
from __future__ import unicode_literals, print_function
import numpy
import io
import json
import random
import re
import os
from os import path
from libc.string cimport memset
import ujson as json
from .syntax import nonproj

View File

@ -1,6 +1,5 @@
from __future__ import absolute_import
from __future__ import unicode_literals
from warnings import warn
import pathlib
from contextlib import contextmanager
import shutil
@ -33,7 +32,6 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity
from .pipeline import DependencyParser, EntityRecognizer
from .pipeline import BeamDependencyParser, BeamEntityRecognizer
from .syntax.arc_eager import ArcEager
from .syntax.ner import BiluoPushDown

View File

@ -2,13 +2,10 @@
# cython: infer_types=True
from __future__ import unicode_literals
from os import path
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .attrs cimport attr_id_t
from .structs cimport TokenC, LexemeC
from .lexeme cimport Lexeme
from .structs cimport TokenC
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
@ -17,7 +14,7 @@ from libcpp.pair cimport pair
from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t
from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport ID, ENT_TYPE
from . import attrs
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc

View File

@ -1,12 +1,8 @@
# cython: infer_types
from __future__ import unicode_literals
from os import path
from libc.string cimport memset
from .lemmatizer import Lemmatizer
try:
import ujson as json
except ImportError:

View File

@ -2,7 +2,6 @@ from .syntax.parser cimport Parser
from .syntax.beam_parser cimport BeamParser
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
from .vocab cimport Vocab
from .tagger import Tagger
# TODO: The disorganization here is pretty embarrassing. At least it's only

View File

@ -1,20 +1,16 @@
import json
import pathlib
from collections import defaultdict
from libc.string cimport memset
from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t, weight_t
from thinc.typedefs cimport atom_t
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec
from .typedefs cimport attr_t
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .gold cimport GoldParse
from .attrs cimport *

View File

@ -1,13 +1,10 @@
# cython: embedsignature=True
from __future__ import unicode_literals
import re
import pathlib
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from cpython cimport Py_UNICODE_ISSPACE
try:
import ujson as json

View File

@ -8,11 +8,8 @@ import os.path
import pathlib
import sys
import six
import textwrap
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
try:
basestring