mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge branch 'master' into spacy.io
This commit is contained in:
		
						commit
						ac5990f793
					
				| 
						 | 
					@ -16,4 +16,4 @@ version=${version/\'/}
 | 
				
			||||||
version=${version/\"/}
 | 
					version=${version/\"/}
 | 
				
			||||||
version=${version/\"/}
 | 
					version=${version/\"/}
 | 
				
			||||||
git tag "v$version"
 | 
					git tag "v$version"
 | 
				
			||||||
git push origin "v$version" --tags
 | 
					git push origin "v$version"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,7 +4,7 @@
 | 
				
			||||||
# fmt: off
 | 
					# fmt: off
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__title__ = "spacy"
 | 
					__title__ = "spacy"
 | 
				
			||||||
__version__ = "2.1.4.dev0"
 | 
					__version__ = "2.1.4"
 | 
				
			||||||
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 | 
					__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 | 
				
			||||||
__uri__ = "https://spacy.io"
 | 
					__uri__ = "https://spacy.io"
 | 
				
			||||||
__author__ = "Explosion AI"
 | 
					__author__ = "Explosion AI"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -11,14 +11,8 @@ def iob2json(input_data, n_sents=10, *args, **kwargs):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Convert IOB files into JSON format for use with train cli.
 | 
					    Convert IOB files into JSON format for use with train cli.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    docs = []
 | 
					    sentences = read_iob(input_data.split("\n"))
 | 
				
			||||||
    for group in minibatch(docs, n_sents):
 | 
					    docs = merge_sentences(sentences, n_sents)
 | 
				
			||||||
        group = list(group)
 | 
					 | 
				
			||||||
        first = group.pop(0)
 | 
					 | 
				
			||||||
        to_extend = first["paragraphs"][0]["sentences"]
 | 
					 | 
				
			||||||
        for sent in group[1:]:
 | 
					 | 
				
			||||||
            to_extend.extend(sent["paragraphs"][0]["sentences"])
 | 
					 | 
				
			||||||
        docs.append(first)
 | 
					 | 
				
			||||||
    return docs
 | 
					    return docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -27,7 +21,6 @@ def read_iob(raw_sents):
 | 
				
			||||||
    for line in raw_sents:
 | 
					    for line in raw_sents:
 | 
				
			||||||
        if not line.strip():
 | 
					        if not line.strip():
 | 
				
			||||||
            continue
 | 
					            continue
 | 
				
			||||||
        # tokens = [t.split("|") for t in line.split()]
 | 
					 | 
				
			||||||
        tokens = [re.split("[^\w\-]", line.strip())]
 | 
					        tokens = [re.split("[^\w\-]", line.strip())]
 | 
				
			||||||
        if len(tokens[0]) == 3:
 | 
					        if len(tokens[0]) == 3:
 | 
				
			||||||
            words, pos, iob = zip(*tokens)
 | 
					            words, pos, iob = zip(*tokens)
 | 
				
			||||||
| 
						 | 
					@ -49,3 +42,15 @@ def read_iob(raw_sents):
 | 
				
			||||||
    paragraphs = [{"sentences": [sent]} for sent in sentences]
 | 
					    paragraphs = [{"sentences": [sent]} for sent in sentences]
 | 
				
			||||||
    docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
 | 
					    docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
 | 
				
			||||||
    return docs
 | 
					    return docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def merge_sentences(docs, n_sents):
 | 
				
			||||||
 | 
					    merged = []
 | 
				
			||||||
 | 
					    for group in minibatch(docs, size=n_sents):
 | 
				
			||||||
 | 
					        group = list(group)
 | 
				
			||||||
 | 
					        first = group.pop(0)
 | 
				
			||||||
 | 
					        to_extend = first["paragraphs"][0]["sentences"]
 | 
				
			||||||
 | 
					        for sent in group[1:]:
 | 
				
			||||||
 | 
					            to_extend.extend(sent["paragraphs"][0]["sentences"])
 | 
				
			||||||
 | 
					        merged.append(first)
 | 
				
			||||||
 | 
					    return merged
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user