Python Deep Learning Notes 08 -- Common Methods for Working with Text Data



6.1 Working with text data

6.1.1 One-hot encoding of words and characters

(1) Word-level one-hot encoding:

# Word-level one-hot encoding
import numpy as np

# Initial data: each element of the list is one sample
# (here a sample is a single sentence, but it could be an entire document)
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Build an index of all tokens in the data
token_index = {}
for sample in samples:
    # Tokenize the samples via the split method. In a real application,
    # you would also strip punctuation and special characters.
    for word in sample.split():
        if word not in token_index:
            # Assign a unique index to each unique word
            token_index[word] = len(token_index) + 1
            # Note that index 0 is not assigned to any word

# Vectorize the samples. Only the first max_length words of each sample are considered.
max_length = 10

# Store the result in results, a 3D tensor: the first axis is the sample,
# the second axis is the position of a word within the sample,
# and the third axis is that word's one-hot vector.
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.  # Set the entry for each word that occurs to 1
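
To sanity-check the result, each one-hot row can be mapped back to its word by finding the position of the 1. A minimal sketch, assuming it runs right after the listing above (reverse_index is a helper introduced here for illustration):

reverse_index = {index: word for word, index in token_index.items()}

print(results.shape)  # (2, 10, 11): 2 samples, 10 positions, 10 unique words + unused index 0
for row in results[0]:
    if row.any():  # skip all-zero padding positions
        print(reverse_index[int(row.argmax())])  # prints the words of the first sample in order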

 

(2) Character-level one-hot encoding:

import string
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable  # All printable ASCII characters
token_index = dict(zip(characters, range(1, len(characters) + 1)))

max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        results[i, j, index] = 1.
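
Note that string.printable contains 100 characters (digits, letters, punctuation, and whitespace), so each character vector here has length 101, with index 0 again left unassigned. A quick check, assuming the listing above has just run:

import string

print(len(string.printable))  # 100
print(results.shape)          # (2, 50, 101)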

 

(3) Word-level one-hot encoding with Keras:

from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Create a tokenizer, configured to only take into account
# the 1,000 most common words
tokenizer = Tokenizer(num_words=1000)
# Build the word index
tokenizer.fit_on_texts(samples)

# Turn strings into lists of integer indices
sequences = tokenizer.texts_to_sequences(samples)

# You could also directly get the one-hot binary representations.
# This tokenizer supports vectorization modes other than one-hot encoding as well.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

# Recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
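
Unlike the manual version, the Keras Tokenizer lowercases text and strips punctuation by default, so 'The' and 'the' share one index. Besides 'binary', texts_to_matrix also supports the 'count', 'tfidf', and 'freq' modes. A quick look at the outputs, assuming the listing above has just run:

print(sequences)              # e.g. [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
print(one_hot_results.shape)  # (2, 1000): one row of 0s and 1s per sample
print(tokenizer.texts_to_matrix(samples, mode='count')[0][:10])  # word counts instead of 0/1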

 

(4) Word-level one-hot encoding with the hashing trick:

import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Store the words as vectors of length 1,000. If you have close to 1,000 words
# (or more), you will see many hash collisions, which will decrease
# the accuracy of this encoding method.
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        # Hash the word into a "random" integer index between 0 and 999
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.
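
One caveat: since Python 3.3, the built-in hash() is salted per interpreter run for strings (see PYTHONHASHSEED), so the same word can map to different indices across runs. If the encoding must be reproducible, a stable hash is safer. A minimal sketch using hashlib (stable_index is a hypothetical helper introduced here):

import hashlib

def stable_index(word, dimensionality=1000):
    # md5 is deterministic across runs and platforms, unlike the built-in hash()
    digest = hashlib.md5(word.encode('utf-8')).hexdigest()
    return int(digest, 16) % dimensionality

print(stable_index('cat'))  # same index on every run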

 

6.1.2 Using word embeddings

(1) Learning word embeddings with the Embedding layer:

from keras.layers import Embedding

# The Embedding layer takes at least two arguments: the number of possible
# tokens (here 1,000, i.e. maximum word index + 1) and the dimensionality
# of the embeddings (here 64)
embedding_layer = Embedding(1000, 64)

from keras.datasets import imdb
from keras import preprocessing

# Number of words to consider as features
max_features = 10000
# Cut texts after this many words
# (among the max_features most common words)
maxlen = 20

# Load the data as lists of integers
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Turn the lists of integers into a 2D integer tensor of shape (samples, maxlen)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)


from keras.models import Sequential
from keras.layers import Flatten, Dense

model = Sequential()
# Specify the maximum input length of the Embedding layer so that the
# embedded inputs can be flattened later
model.add(Embedding(max_features, 8, input_length=maxlen))
# After this layer, the activations have shape (samples, maxlen, 8)

# Flatten the 3D tensor of embeddings into a 2D tensor of shape (samples, maxlen * 8)
model.add(Flatten())

# Add the classifier on top
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# model.summary()

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)
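
The Embedding layer is essentially a trainable lookup table that maps integer indices to dense vectors. A minimal standalone sketch of that behavior (the weights here are random and untrained; the toy model name is introduced just for this demonstration):

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

toy = Sequential()
toy.add(Embedding(1000, 64, input_length=3))
toy.compile('rmsprop', 'mse')

batch = np.array([[4, 20, 7]])   # one sample made of 3 word indices
print(toy.predict(batch).shape)  # (1, 3, 64): one 64-dimensional vector per index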

 

(2) Using pretrained word embeddings:

import os

imdb_dir = '/home/ubuntu/data/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

# Collect the raw reviews and their labels (0 = negative, 1 = positive)
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100  # We will cut reviews after 100 words
training_samples = 200  # We will be training on 200 samples
validation_samples = 10000  # We will be validating on 10000 samples
max_words = 10000  # We will only consider the top 10,000 words in the dataset

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Split the data into a training set and a validation set.
# But first, shuffle the data, since we started from data
# where samples are ordered (all negative first, then all positive).
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

glove_dir = '/home/ubuntu/data/'

# Parse the GloVe file: each line is a word followed by its vector components
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_dim = 100

# Build a (max_words, embedding_dim) matrix that the Embedding layer can load
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector
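
# Note: row 0 of embedding_matrix stays all zeros because the Keras Tokenizer
# reserves index 0 (it is used for padding), and words with an index >= max_words
# are skipped by the loop above.
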
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# Load the pretrained GloVe vectors into the Embedding layer and freeze it,
# so the pretrained representations are not destroyed during training
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

# For comparison: the same model trained from scratch, without loading
# the pretrained GloVe embeddings
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

# Finally, tokenize the test data and evaluate the saved GloVe model on it
test_dir = os.path.join(imdb_dir, 'test')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)

model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)
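
With the GloVe weights loaded back, the same tokenizer and model can also score a raw review end to end. A minimal sketch, assuming the listings above have run in one session (the review text is made up for illustration):

review = ["This movie was a complete waste of time, boring from start to finish."]

seq = tokenizer.texts_to_sequences(review)
padded = pad_sequences(seq, maxlen=maxlen)
print(model.predict(padded))  # close to 0 suggests negative, close to 1 positive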

 
