mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Merge branch 'master' into develop
This commit is contained in:
commit
ccea10b4a9
|
@ -1,18 +1,13 @@
|
|||
from __future__ import unicode_literals
|
||||
import plac
|
||||
import json
|
||||
from os import path
|
||||
import shutil
|
||||
import os
|
||||
import random
|
||||
import io
|
||||
import pathlib
|
||||
|
||||
from spacy.tokens import Doc
|
||||
from spacy.syntax.nonproj import PseudoProjectivity
|
||||
from spacy.language import Language
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.pipeline import DependencyParser, BeamDependencyParser
|
||||
from spacy.syntax.parser import get_templates
|
||||
|
@ -23,7 +18,6 @@ import spacy.attrs
|
|||
import io
|
||||
|
||||
|
||||
|
||||
def read_conllx(loc, n=0):
|
||||
with io.open(loc, 'r', encoding='utf8') as file_:
|
||||
text = file_.read()
|
||||
|
@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
|
|||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, \
|
||||
_2 = line.split('\t')
|
||||
if '-' in id_ or '.' in id_:
|
||||
continue
|
||||
try:
|
||||
|
@ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
|
|||
random.shuffle(train_sents)
|
||||
scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
|
||||
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
|
||||
nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
|
||||
nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
|
||||
nlp.end_training(model_dir)
|
||||
scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
|
||||
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||
from libc.stdio cimport fopen, fclose, fread, fwrite
|
||||
from libc.string cimport memcpy
|
||||
|
||||
|
||||
|
|
|
@ -21,7 +21,6 @@ MORPH_RULES = {
|
|||
"them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
|
||||
|
||||
"mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
|
||||
"yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
|
||||
"his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
|
||||
"hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
|
||||
"its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
|
||||
|
|
|
@ -193,9 +193,6 @@ TOKENIZER_EXCEPTIONS = {
|
|||
"vm.": [
|
||||
{ORTH: "vm.", LEMMA: "viimeksi mainittu"}
|
||||
],
|
||||
"siht.": [
|
||||
{ORTH: "siht.", LEMMA: "sihteeri"}
|
||||
],
|
||||
"srk.": [
|
||||
{ORTH: "srk.", LEMMA: "seurakunta"}
|
||||
]
|
||||
|
|
|
@ -1,16 +1,12 @@
|
|||
# cython: profile=True
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import numpy
|
||||
import io
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import os
|
||||
from os import path
|
||||
|
||||
from libc.string cimport memset
|
||||
|
||||
import ujson as json
|
||||
|
||||
from .syntax import nonproj
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
from warnings import warn
|
||||
import pathlib
|
||||
from contextlib import contextmanager
|
||||
import shutil
|
||||
|
@ -33,7 +32,6 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
|
|||
from .syntax.parser import get_templates
|
||||
from .syntax.nonproj import PseudoProjectivity
|
||||
from .pipeline import DependencyParser, EntityRecognizer
|
||||
from .pipeline import BeamDependencyParser, BeamEntityRecognizer
|
||||
from .syntax.arc_eager import ArcEager
|
||||
from .syntax.ner import BiluoPushDown
|
||||
|
||||
|
|
|
@ -2,13 +2,10 @@
|
|||
# cython: infer_types=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from os import path
|
||||
|
||||
from .typedefs cimport attr_t
|
||||
from .typedefs cimport hash_t
|
||||
from .attrs cimport attr_id_t
|
||||
from .structs cimport TokenC, LexemeC
|
||||
from .lexeme cimport Lexeme
|
||||
from .structs cimport TokenC
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
|
@ -17,7 +14,7 @@ from libcpp.pair cimport pair
|
|||
from murmurhash.mrmr cimport hash64
|
||||
from libc.stdint cimport int32_t
|
||||
|
||||
from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
||||
from .attrs cimport ID, ENT_TYPE
|
||||
from . import attrs
|
||||
from .tokens.doc cimport get_token_attr
|
||||
from .tokens.doc cimport Doc
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
# cython: infer_types
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from os import path
|
||||
|
||||
from libc.string cimport memset
|
||||
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError:
|
||||
|
|
|
@ -2,7 +2,6 @@ from .syntax.parser cimport Parser
|
|||
from .syntax.beam_parser cimport BeamParser
|
||||
from .syntax.ner cimport BiluoPushDown
|
||||
from .syntax.arc_eager cimport ArcEager
|
||||
from .vocab cimport Vocab
|
||||
from .tagger import Tagger
|
||||
|
||||
# TODO: The disorganization here is pretty embarrassing. At least it's only
|
||||
|
|
|
@ -1,20 +1,16 @@
|
|||
import json
|
||||
import pathlib
|
||||
from collections import defaultdict
|
||||
from libc.string cimport memset
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport atom_t, weight_t
|
||||
from thinc.typedefs cimport atom_t
|
||||
from thinc.extra.eg cimport Example
|
||||
from thinc.structs cimport ExampleC
|
||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
||||
from thinc.linalg cimport VecVec
|
||||
|
||||
from .typedefs cimport attr_t
|
||||
from .tokens.doc cimport Doc
|
||||
from .attrs cimport TAG
|
||||
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
|
||||
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
|
||||
from .gold cimport GoldParse
|
||||
|
||||
from .attrs cimport *
|
||||
|
|
|
@ -1,13 +1,10 @@
|
|||
# cython: embedsignature=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
import pathlib
|
||||
|
||||
from cython.operator cimport dereference as deref
|
||||
from cython.operator cimport preincrement as preinc
|
||||
from cpython cimport Py_UNICODE_ISSPACE
|
||||
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
|
|
|
@ -8,11 +8,8 @@ import os.path
|
|||
import pathlib
|
||||
import sys
|
||||
|
||||
import six
|
||||
import textwrap
|
||||
|
||||
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||
|
||||
|
||||
try:
|
||||
basestring
|
||||
|
|
Loading…
Reference in New Issue
Block a user