What does CRF mean?


CRF

Basic translations

abbr. Cathode Ray Furnace; Corticotropin Releasing Factor; chronic renal failure

Web definitions

CRF: chronic renal failure | chronic kidney failure | corticotropin-releasing factor

CRF -: channel repetition frequency | cost and freight | chronic renal failure

CKD-CRF: chronic kidney disease | kidney disease

In NLP, however, CRF most often stands for Conditional Random Field, the sequence-labeling model that the BiLSTM+CRF code below builds on.

"""

Named entity recognition (NER) with BiLSTM + CRF

1. Prepare the data: origin_handle_entities()
   Read the raw corpus and merge person, place and organization names.

2. Read the preprocessed data: origin_handle_mark()
   Tag the preprocessed text character by character in B/M/E/O format:
   B (begin), M (middle), E (end), O (other)

3. Sentence splitting: sentence_split()
   Split the data on punctuation and similar delimiters.

4. Save the data
   a. Split the tagged sentences into token lists and matching tag sequences
   b. Build the vocabulary and the tag set
   c. Vectorize the text
   d. Split into training and test sets
   e. Save everything as a binary pkl file

5. Load the data

6. Train the BiLSTM+CRF model

7. Save the trained model for prediction

8. Predict

"""

import codecs
import re
import collections
import pickle

import numpy as np
from TorchCRF import CRF  # CRF layer from the TorchCRF package
from tensorflow.keras.preprocessing.sequence import pad_sequences  # pad sequences to equal length (tensorflow 2.3.1)
from sklearn.model_selection import train_test_split

def origin_handle_entities():
    with open('renmin.txt', 'r', encoding='utf-8') as inp, \
            open('middle/renmin2.txt', 'w', encoding='utf-8') as outp:
        # read the raw corpus line by line
        for line in inp.readlines():
            # split the line into word/tag tokens
            line = line.split(' ')
            i = 1
            while i < len(line) - 1:
                if line[i][0] == '[':
                    # '[' opens a compound entity such as
                    # [中央/n 人民/n 广播/vn 电台/n]nt -- merge its words
                    outp.write(line[i].split('/')[0][1:])
                    i += 1
                    while i < len(line) - 1 and line[i].find(']') == -1:
                        if line[i] != '':
                            outp.write(line[i].split('/')[0])
                        i += 1
                    # write the last word plus the tag that follows ']'
                    outp.write(line[i].split('/')[0].strip() + '/' +
                               line[i].split(']')[-1].strip() + ' ')
                elif line[i].split('/')[1] == 'nr':
                    # merge two consecutive person-name (nr) tokens,
                    # i.e. surname + given name, into one word
                    word = line[i].split('/')[0]
                    i += 1
                    if i < len(line) - 1 and line[i].split('/')[1] == 'nr':
                        outp.write(word + line[i].split('/')[0] + '/nr ')
                    else:
                        outp.write(word + '/nr ')
                        continue
                else:
                    # ordinary token: copy it unchanged
                    outp.write(line[i] + ' ')
                i += 1
            outp.write('\n')
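For instance, the bracketed compound [中央/n 人民/n 广播/vn 电台/n]nt in the raw People's Daily corpus comes out as the single token 中央人民广播电台/nt, and a split person name like 张/nr 三/nr becomes 张三/nr.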


def origin_handle_mark():
    """
    1. Read the preprocessed renmin2.txt
    2. Write the character-level tagged data to renmin3.txt
       a. open the input and output files, b. iterate over renmin2.txt
    :return:
    """
    with codecs.open('middle/renmin2.txt', 'r', encoding='utf-8') as inp, \
            codecs.open('middle/renmin3.txt', 'w', encoding='utf-8') as outp:
        # the loop body was cut off in the original post; the B/M/E/O
        # tagging below is a plausible reconstruction of step 2
        for line in inp.readlines():
            for token in line.strip().split(' '):
                if '/' not in token:
                    continue
                word, tag = token.rsplit('/', 1)
                if tag in ('nr', 'ns', 'nt') and word:
                    # first char B_, last char E_, the rest M_
                    tags = (['B_' + tag] + ['M_' + tag] * (len(word) - 2)
                            + ['E_' + tag] * (len(word) > 1))
                    outp.write(' '.join(c + '/' + t for c, t in zip(word, tags)) + ' ')
                else:
                    outp.write(' '.join(c + '/O' for c in word) + ' ')
            outp.write('\n')

######### sentence splitting #########
def sentence_split():
    with codecs.open('middle/renmin3.txt', 'r', encoding='utf-8') as inp, \
            codecs.open('middle/renmin4.txt', 'w', encoding='utf-8') as outp:
        # the file is already utf-8 text, so no re-encoding is needed
        texts = inp.read()
        # split into sentences at punctuation characters tagged /O
        sentences = re.split('[,。!?、''"":]/[O]', texts)
        for sentence in sentences:
            if sentence.strip():
                outp.write(sentence.strip() + '\n')
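Given tagged text like 今/O 天/O ,/O 北/B_ns 京/M_ns 市/E_ns, the regex matches the ,/O token, so each output line of renmin4.txt holds one clause-like sentence.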

def data_to_pkl():
    """
    Save the vectorized text data as a binary pkl file
    (left unimplemented in the original post; see the sketch below)
    :return:
    """

def main():
    # data cleaning
    origin_handle_entities()
    # character-level tagging
    origin_handle_mark()
    # sentence splitting
    sentence_split()
    # data conversion
    data_to_pkl()

if __name__ == '__main__':
    main()
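End to end, this pipeline reads renmin.txt and produces middle/renmin2.txt (entities merged), middle/renmin3.txt (character-level tags), middle/renmin4.txt (one sentence per line) and finally the renmindata.pkl file consumed below.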

##################################################################################################

def load_data():
    pickle_path = '../data_target_pkl/renmindata.pkl'
    with open(pickle_path, 'rb') as inp:
        word2id, id2word, tag2id, id2tag, x_train, y_train, x_test, y_test, x_valid, y_valid = pickle.load(inp)
    return word2id, id2word, tag2id, id2tag, x_train, y_train, x_test, y_test, x_valid, y_valid

def main():
    word2id = load_data()[0]
    print(len(word2id))

if __name__ == '__main__':
    main()

#######################################################################################

import torch
import torch.nn as nn
from torch.utils.data import Dataset  # Dataset wrapper for batch data loading

class NERDataSet(Dataset):
    """
    X: samples, Y: labels
    """
    def __init__(self, X, Y, *args, **kwargs):
        super(NERDataSet, self).__init__(*args, **kwargs)
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return {'x': self.X[idx], 'y': self.Y[idx]}

class Config():
    embedding_dim = 100   # dimension of the word embeddings
    hidden_dim = 200
    vocab_size = None     # filled in from the data: len(word2id) + 1
    num_tags = None       # filled in from the data: len(tag2id)

config = Config()

class NERLSTM_CRF(nn.Module):
    """
    1. input layer
    2. word embedding: Embedding(vocab_size, embedding_dim)
    3. LSTM
    4. fully connected layer
    """
    def __init__(self):
        super(NERLSTM_CRF, self).__init__()
        self.embedding_dim = config.embedding_dim
        self.hidden_dim = config.hidden_dim
        self.vocab_size = config.vocab_size
        self.num_tags = config.num_tags
        # layers reconstructed from the docstring above (the original
        # __init__ was cut off here)
        self.embeds = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim // 2,
                            bidirectional=True, batch_first=True)
        self.linear = nn.Linear(self.hidden_dim, self.num_tags)
        self.crf = CRF(self.num_tags)
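The class body breaks off at this point in the original. A sketch of the remaining methods, assuming the TorchCRF call signatures (forward returns per-sentence log-likelihoods, viterbi_decode the best tag paths), could continue inside the class like this:

    def _emissions(self, x):
        # ids -> embeddings -> BiLSTM -> per-tag emission scores
        out, _ = self.lstm(self.embeds(x))
        return self.linear(out)

    def loss(self, x, y, mask):
        # mean negative log-likelihood of the gold tag sequences
        return -self.crf.forward(self._emissions(x), y, mask).mean()

    def forward(self, x, mask):
        # Viterbi-decode the best tag sequence for each sentence
        return self.crf.viterbi_decode(self._emissions(x), mask)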

##################################################
from torch.utils.data import DataLoader  # batch data loading
import torch.optim as op

def utils_to_train():
    device = torch.device('cpu')
    max_epoch = 1
    batch_size = 32
    num_workers = 4  # number of worker processes for data loading
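The training code below unpacks six values from utils_to_train(), but the original body stops at the hyper-parameters. Continuing inside the function, a sketch of the missing part (the vocab_size/num_tags wiring and the loader settings are assumptions):

    (word2id, id2word, tag2id, id2tag,
     x_train, y_train, x_test, y_test, x_valid, y_valid) = load_data()
    config.vocab_size = len(word2id) + 1   # +1 for the padding id 0
    config.num_tags = len(tag2id)

    train_data_loader = DataLoader(NERDataSet(x_train, y_train),
                                   batch_size=batch_size, shuffle=True,
                                   num_workers=num_workers)
    valid_data_loader = DataLoader(NERDataSet(x_valid, y_valid),
                                   batch_size=batch_size, shuffle=False,
                                   num_workers=num_workers)
    test_data_loader = DataLoader(NERDataSet(x_test, y_test),
                                  batch_size=batch_size, shuffle=False,
                                  num_workers=num_workers)
    model = NERLSTM_CRF().to(device)
    return (max_epoch, device, train_data_loader,
            valid_data_loader, test_data_loader, model)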

def parse_tags(text, path):
    # id2tag is the fourth item returned by load_data()
    id2tag = load_data()[3]
    tags = [id2tag[idx] for idx in path]
    return tags

##################################################

from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

word2id = load_data()[0]
max_epoch, device, train_data_loader, valid_data_loader, test_data_loader, model = utils_to_train()

class ChineseNER(object):

    def train(self):
        optimizer = op.Adam(model.parameters(), lr=0.001)  # assumed optimizer
        for epoch in range(max_epoch):
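The post is cut off at this point. A plausible continuation of the epoch loop, masking out the zero padding ids and using the loss method sketched above:

            for batch in train_data_loader:
                x = batch['x'].long().to(device)
                y = batch['y'].long().to(device)
                mask = (x > 0).to(device)   # padding id 0 is ignored by the CRF
                optimizer.zero_grad()
                loss = model.loss(x, y, mask)
                loss.backward()
                optimizer.step()
            print('epoch %d done, last batch loss: %.4f' % (epoch, loss.item()))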