LSTM

程序员文章站 2024-03-24 23:27:16

...

import re
import numpy as np
import pandas as pd
from keras import Sequential
from keras.layers import Embedding, LSTM, Flatten, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

train_data = pd.read_csv('./data/train.csv', lineterminator='\n')
test_data = pd.read_csv('./data/test.csv', lineterminator='\n')
train_data['label'] = train_data['label'].map({'Negative': 0, 'Positive': 1})
train_data = train_data.as_matrix()
test_data = test_data.as_matrix()


# two commom ways to clean data
def cleaner(word):
    word = re.sub(r'\#\.', '', word)
    word = re.sub(r'\n', '', word)
    word = re.sub(r',', '', word)
    word = re.sub(r'\-', ' ', word)
    word = re.sub(r'\.', '', word)
    word = re.sub(r'\\', ' ', word)
    word = re.sub(r'\\x\.+', '', word)
    word = re.sub(r'\d', '', word)
    word = re.sub(r'^_.', '', word)
    word = re.sub(r'_', ' ', word)
    word = re.sub(r'^ ', '', word)
    word = re.sub(r' $', '', word)
    word = re.sub(r'\?', '', word)

    return word.lower()


def hashing(word):
    word = re.sub(r'ain$', r'ein', word)
    word = re.sub(r'ai', r'ae', word)
    word = re.sub(r'ay$', r'e', word)
    word = re.sub(r'ey$', r'e', word)
    word = re.sub(r'ie$', r'y', word)
    word = re.sub(r'^es', r'is', word)
    word = re.sub(r'a+', r'a', word)
    word = re.sub(r'j+', r'j', word)
    word = re.sub(r'd+', r'd', word)
    word = re.sub(r'u', r'o', word)
    word = re.sub(r'o+', r'o', word)
    word = re.sub(r'ee+', r'i', word)
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    if re.search(r'[rst]y', 'word') and word[-1] != 'y':
        word = re.sub(r'y', r'i', word)
    if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
        word = re.sub(r'h', '', word)
    word = re.sub(r'k', r'q', word)
    return word


def array_cleaner(array):
    X = []
    for sentence in array:
        clean_sentence = ''
        words = sentence.split(' ')
        for word in words:
            clean_sentence = clean_sentence + ' ' + cleaner(word)
        X.append(clean_sentence)
    return X


X_test = test_data[:, 1]
X_train = train_data[:, 1]
X_train = array_cleaner(X_train)
X_test = array_cleaner(X_test)
y_train = np.array(train_data[:, 2], dtype=int)

X_all = X_train + X_test
tokenizer = Tokenizer(nb_words=2000, filters='!"#$%&()*+,-./:;<=>[email protected][\\]^_`{|}~\t\n', lower=True, split=' ')
tokenizer.fit_on_texts(X_all)
X_all = tokenizer.texts_to_sequences(X_all)
X_all = pad_sequences(X_all)
X_train = X_all[:len(y_train)]
X_test = X_all[len(y_train):]

embed_dim = 128
list_out = 256
batch_size = 32
model = Sequential()
model.add(Embedding(2000, embed_dim, input_length=X_train.shape[1], dropout=0.2))
model.add(LSTM(list_out, dropout=0.2, return_sequences=True))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, batch_size=batch_size, epochs=10)
y_pred = model.predict(X_test)
print(y_pred[:10])
result = pd.DataFrame.from_dict({
    'ID': range(1, len(y_pred) + 1),
    'Pred': y_pred.reshape((-1))
})
result.to_csv('./result/submission.csv', index=None)

上一篇：详细介绍php中的命名空间

下一篇：异常处理机制

LSTM

pytorch nn.LSTM()参数详解

LSTM

pytorch LSTM

Pytorch LSTM

pytorch 中的torch.nn.LSTM函数

LSTM时间序列预测

LSTM输入结构

创作能手——LSTM“诗人”

DL之LSTM：基于tensorflow框架利用LSTM算法对气温数据集训练并预测

循环神经网络—(RNN、LSTM、GRU)