2017-10-04 22:44:09 cdxy LSTM,RNN,WAF

In [1]:

import sys
import os
import json
import pandas
import numpy
import optparse
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict

Using TensorFlow backend.

In [2]:

# Load the labelled access log. The file has no header row, and '|' is used as
# the quote character so that each JSON request (full of '"' and ',') is read
# as a single field.
dataframe = pandas.read_csv(
    './data/dev-access.csv',
    header=None,
    quotechar='|',
    engine='python',
)

In [3]:

# Preview the first 10 rows: column 0 is the raw request JSON, column 1 the 0/1 label.
dataframe.head(10)

Out[3]:

	0	1
0	{"timestamp":1502738402847,"method":"post","qu...	0
1	{"timestamp":1502738402849,"method":"post","qu...	0
2	{"timestamp":1502738402852,"method":"post","qu...	0
3	{"timestamp":1502738402852,"method":"post","qu...	0
4	{"timestamp":1502738402853,"method":"post","qu...	0
5	{"timestamp":1502738402853,"method":"post","qu...	0
6	{"timestamp":1502738402854,"method":"post","qu...	0
7	{"timestamp":1502738402855,"method":"post","qu...	0
8	{"timestamp":1502738402856,"method":"post","qu...	0
9	{"timestamp":1502738402856,"method":"post","qu...	0

In [4]:

# Shuffle all rows, then drop down to a plain NumPy array. The train/test split
# later in the notebook is positional, so the shuffle decides which rows end up
# in which set; a fixed random_state keeps that assignment identical across
# "Restart & Run All" executions.
dataset = dataframe.sample(frac=1, random_state=42).values

In [5]:

# Preprocess dataset: column 0 holds the raw request JSON, column 1 the label.
# NOTE: both are slices of `dataset` (presumably views, as with NumPy basic
# slicing), so writing into X later also changes `dataset`.
X, Y = dataset[:, 0], dataset[:, 1]

In [6]:

Out[6]:

array([ '{"timestamp":1502738602036,"method":"get","query":{"query":"Tops&_method=PUT"},"path":"/search","statusCode":404,"source":{"remoteAddress":"22.148.143.9","referer":"http://localhost:8002/enter"},"route":"/search","headers":{"host":"localhost:8002","connection":"keep-alive","accept":"*/*","cache-control":"no-cache","x-requested-with":"XMLHttpRequest","referer":"http://localhost:8002/enter","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}',
       '{"timestamp":1502738461497,"method":"get","query":{"query":"etudzum"},"path":"/search","statusCode":404,"source":{"remoteAddress":"81.27.152.121"},"route":"/search","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}',
       '{"timestamp":1502738585782,"method":"get","query":{"query":"Area & Accent Rugs/*"},"path":"/search","statusCode":404,"source":{"remoteAddress":"251.8.39.54"},"route":"/search","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}',
       ...,
       '{"timestamp":1502738590561,"method":"get","query":{"query":"Watering Equipment&_method=POST&isAdmin=true"},"path":"/search","statusCode":404,"source":{"remoteAddress":"99.98.90.102","referer":"http://localhost:8002/enter"},"route":"/search","headers":{"host":"localhost:8002","connection":"keep-alive","accept":"*/*","cache-control":"no-cache","x-requested-with":"XMLHttpRequest","referer":"http://localhost:8002/enter","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}',
       '{"timestamp":1502738403024,"method":"get","query":{},"path":"/PMA2013","statusCode":404,"source":{"remoteAddress":"243.15.81.191","referer":"http://localhost:8002/enter"},"route":"/{p*}","headers":{"host":"localhost:8002","connection":"keep-alive","accept":"*/*","cache-control":"no-cache","x-requested-with":"XMLHttpRequest","referer":"http://localhost:8002/enter","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}',
       '{"timestamp":1502738643645,"method":"post","query":{},"path":"/login","statusCode":200,"source":{"remoteAddress":"77.11.74.111"},"route":"/login","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6","content-type":"application/json","content-length":"52"},"requestPayload":{"username":"Herb2","password":"pizzaloaflasdf0v32"},"responsePayload":"LOGIN"}'], dtype=object)

In [7]:

Out[7]:

array([1, 0, 1, ..., 1, 1, 0], dtype=object)

In [8]:

# Top-level request fields that are removed before tokenizing; what remains in
# each record is whatever else the log entry contains (method, query, path,
# statusCode and requestPayload in the records shown in this notebook).
dropped_fields = ('timestamp', 'headers', 'source', 'route', 'responsePayload')

for index, item in enumerate(X):
    # OrderedDict keeps the remaining keys in their original order, so the
    # re-serialized text has a stable layout.
    reqJson = json.loads(item, object_pairs_hook=OrderedDict)
    for field in dropped_fields:
        # pop() with a default instead of `del`: X is overwritten in place, so
        # re-running this cell (or meeting a record that lacks a field) must
        # not raise KeyError.
        reqJson.pop(field, None)
    # Compact separators: no whitespace is added between JSON tokens.
    X[index] = json.dumps(reqJson, separators=(',', ':'))

In [9]:

Out[9]:

array([ '{"method":"get","query":{"query":"Tops&_method=PUT"},"path":"/search","statusCode":404,"requestPayload":null}',
       '{"method":"get","query":{"query":"etudzum"},"path":"/search","statusCode":404,"requestPayload":null}',
       '{"method":"get","query":{"query":"Area & Accent Rugs/*"},"path":"/search","statusCode":404,"requestPayload":null}',
       ...,
       '{"method":"get","query":{"query":"Watering Equipment&_method=POST&isAdmin=true"},"path":"/search","statusCode":404,"requestPayload":null}',
       '{"method":"get","query":{},"path":"/PMA2013","statusCode":404,"requestPayload":null}',
       '{"method":"post","query":{},"path":"/login","statusCode":200,"requestPayload":{"username":"Herb2","password":"pizzaloaflasdf0v32"}}'], dtype=object)

Tokenize¶

In [10]:

# Character-level tokenizer: each character of a request string becomes one
# token. Only tab and newline are listed as filtered characters.
tokenizer = Tokenizer(
    char_level=True,
    filters='\t\n',
)
# Build the character -> integer id vocabulary from the cleaned request strings.
tokenizer.fit_on_texts(X)

In [11]:

# Persist the tokenizer vocabulary (token -> integer id) as JSON so the same
# encoding can be reused outside this notebook.
word_dict_file = 'build/word-dictionary.json'
word_dict_dir = os.path.dirname(word_dict_file)

# Create the output directory on first run.
if not os.path.exists(word_dict_dir):
    os.makedirs(word_dict_dir)

with open(word_dict_file, 'w') as outfile:
    # ensure_ascii=False writes non-ASCII tokens as-is instead of \uXXXX escapes.
    json.dump(tokenizer.word_index, outfile, ensure_ascii=False)

In [12]:

# Vocabulary size handed to the embedding layer: one slot per known token plus
# one extra (presumably for index 0, which word_index does not assign — confirm).
num_words = 1 + len(tokenizer.word_index)
# Replace every request string by its list of integer token ids.
X = tokenizer.texts_to_sequences(X)
# Show the first encoded request as a sanity check.
X[0]

Out[12]:

[16,
 1,
 19,
 2,
 3,
 13,
 7,
 11,
 1,
 4,
 1,
 28,
 2,
 3,
 1,
 10,
 1,
 15,
 8,
 2,
 9,
 14,
 1,
 4,
 16,
 1,
 15,
 8,
 2,
 9,
 14,
 1,
 4,
 1,
 36,
 7,
 18,
 6,
 32,
 62,
 19,
 2,
 3,
 13,
 7,
 11,
 37,
 26,
 72,
 36,
 1,
 17,
 10,
 1,
 18,
 5,
 3,
 13,
 1,
 4,
 1,
 24,
 6,
 2,
 5,
 9,
 23,
 13,
 1,
 10,
 1,
 6,
 3,
 5,
 3,
 8,
 6,
 25,
 7,
 11,
 2,
 1,
 4,
 22,
 20,
 22,
 10,
 1,
 9,
 2,
 15,
 8,
 2,
 6,
 3,
 26,
 5,
 14,
 12,
 7,
 5,
 11,
 1,
 4,
 21,
 8,
 12,
 12,
 17]

In [13]:

# Fixed sequence length fed to the network.
max_log_length = 1024
# Number of rows used for training: the first 75% of the shuffled dataset.
train_size = int(0.75 * len(dataset))

In [14]:

# Bring every encoded request to the same length (max_log_length) so the
# sequences can be stacked into one 2-D array.
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)
# Y was sliced out of a mixed-type array and therefore has dtype=object; cast
# it to a numeric dtype so Keras can turn the labels into a tensor.
Y_numeric = Y.astype('float32')
# Split into training and test sets: the first train_size rows are used for
# training, the remainder for testing (rows were shuffled when `dataset` was built).
X_train, X_test = X_processed[:train_size], X_processed[train_size:]
Y_train, Y_test = Y_numeric[:train_size], Y_numeric[train_size:]

In [15]:

# TensorBoard callback passed to model.fit below; logs are written under ./logs.
tb_callback = TensorBoard(
    log_dir='./logs',
    embeddings_freq=1,
)

Model¶

In [16]:

# Character embedding -> single LSTM layer -> one sigmoid unit for the binary
# label, with dropout before and after the LSTM.
model = Sequential([
    Embedding(num_words, 32, input_length=max_log_length),
    Dropout(0.5),
    LSTM(64, recurrent_dropout=0.5),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
# Binary classification: cross-entropy loss, accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [17]:

# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 1024, 32)          2816      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024, 32)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
=================================================================
Total params: 27,713
Trainable params: 27,713
Non-trainable params: 0
_________________________________________________________________

In [18]:

# Train for 3 epochs. validation_split=0.25 holds out a quarter of the training
# rows for validation (15059 train / 5020 validation samples in the recorded run).
model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
    callbacks=[tb_callback],
)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
15059/15059 [==============================] - 452s - loss: 0.5981 - acc: 0.6662 - val_loss: 0.3296 - val_acc: 0.8990
Epoch 2/3
15059/15059 [==============================] - 454s - loss: 0.2738 - acc: 0.9179 - val_loss: 0.1216 - val_acc: 0.9783
Epoch 3/3
15059/15059 [==============================] - 621s - loss: 0.1647 - acc: 0.9618 - val_loss: 0.0631 - val_acc: 0.9914

Out[18]:

<keras.callbacks.History at 0x12772c890>

In [19]:

# Evaluate model on the held-out test rows; evaluate() returns the loss
# followed by the compiled metric (accuracy).
score, acc = model.evaluate(
    X_test,
    Y_test,
    batch_size=128,
    verbose=1,
)
print("Model Accuracy: {:0.2f}%".format(acc * 100))

6694/6694 [==============================] - 64s    
Model Accuracy: 99.12%

In [20]:

# Save model in three forms: weights only, the full model, and the
# architecture alone as JSON.
model.save_weights('securitai-lstm-weights.h5')
model.save('securitai-lstm-model.h5')
architecture_json = model.to_json()
with open('securitai-lstm-model.json', 'w') as json_file:
    json_file.write(architecture_json)

Evaluate 2¶

测试另外一个WAF的数据集 https://github.com/faizann24/Fwaf-Machine-Learning-driven-Web-Application-Firewall

In [21]:

# Directory holding the Fwaf data files (badqueries.txt / goodqueries.txt).
# Override it with the FWAF_DATA_DIR environment variable; the default is the
# path used when this notebook was originally run.
fwaf_data_dir = os.environ.get(
    'FWAF_DATA_DIR',
    '/Users/xy/workspace/Fwaf-Machine-Learning-driven-Web-Application-Firewall',
)


def _read_queries(filename):
    """Read one Fwaf query file from fwaf_data_dir into a DataFrame.

    The first line of the file is used as the header (header=0). The separator
    is a multi-character string, which pandas' python engine treats as a
    regular expression; it appears to be chosen so that it never matches and
    each line is kept as a single column — NOTE(review): confirm.
    """
    return pandas.read_csv(
        os.path.join(fwaf_data_dir, filename),
        engine='python',
        sep='!@#$%^&*',
        header=0,
    )


df_black = _read_queries('badqueries.txt')
# Keep 50000 randomly chosen good queries; the fixed random_state makes the
# selection repeatable across runs.
df_white = _read_queries('goodqueries.txt').sample(n=50000, random_state=42)

In [22]:

# Label the two sources: 1 for rows from the bad-query file, 0 for rows from
# the good-query file.
df_black['label'] = 1
df_white['label'] = 0
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pandas.concat stacks the two frames the same way and works on old and new
# pandas versions alike (original row indices are kept, as append did).
new_dataset = pandas.concat([df_black, df_white])

In [23]:

# Evaluate on a random subset of 10000 rows. A fixed random_state keeps the
# subset — and therefore the accuracy reported below — repeatable across runs.
new_dataset = new_dataset.sample(n=10000, random_state=42)

In [24]:

# Query strings to classify, forced to str so every entry can be tokenized.
X_waf = new_dataset['uri'].values.astype('str')
# Labels must be numeric for model.evaluate; they were previously cast to
# strings ('0' / '1'), which is not a valid target dtype for the loss.
Y_waf = new_dataset['label'].values.astype('float32')

In [25]:

# Encode the query strings with the tokenizer that was fitted on the first
# dataset, then pad to the sequence length the model was built for.
# NOTE: this overwrites X_processed from the first dataset.
X_sequences = tokenizer.texts_to_sequences(X_waf)
X_processed = sequence.pad_sequences(
    X_sequences,
    maxlen=max_log_length,
)

In [26]:

# Evaluate model on the second (Fwaf) dataset using the already trained
# weights; no re-training happens here.
score, acc = model.evaluate(
    X_processed,
    Y_waf,
    batch_size=128,
    verbose=1,
)
print("Model Accuracy: {:0.2f}%".format(acc * 100))

10000/10000 [==============================] - 97s    
Model Accuracy: 73.52%

cdxy.me
Cyber Security / Data Science / Trading

LSTM识别恶意HTTP请求

[Data Science for Cyber Security]

Tokenize¶

Model¶

Evaluate 2¶

Ref¶