Search For Fun

A blog for collecting diverse useful information



python iterate or loop over each row of a dataframe

Posted on 2018-09-27 | Edited on 2018-07-19 | In python

description: python iterate a dataframe

loop over each row of a dataframe

import pandas
data = pandas.DataFrame([[1,2,3],[4,5,6],[7,8,9]])
for row in data.values:
    print(row)
# output
# [1 2 3]
# [4 5 6]
# [7 8 9]
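If the row index is also needed, pandas provides DataFrame.iterrows() (and the faster itertuples()); a minimal sketch on the same data frame:

for index, row in data.iterrows():
    print(index, list(row))
# 0 [1, 2, 3]
# 1 [4, 5, 6]
# 2 [7, 8, 9]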


python assign value of dataframe by conditions

Posted on 2018-09-27 | Edited on 2018-07-14 | In python

description: python assign or set value if conditions match

python assign or set value based on conditions

import pandas

data = pandas.DataFrame({"test": [10, 20, 30, 40], "a": [1, 1, 1, 2], "b": [0, 2, 2, 2]})
#    test  a  b
# 0    10  1  0
# 1    20  1  2
# 2    30  1  2
# 3    40  2  2

filter the data with condition 1

tmp = data[data["a"] == 1]
#    test  a  b
# 0    10  1  0
# 1    20  1  2
# 2    30  1  2

filter the data with condition 2

tmp = tmp[tmp["b"] == 2]
#    test  a  b
# 1    20  1  2
# 2    30  1  2

get the index of the data

indexes = tmp.index
# Int64Index([1, 2], dtype='int64')

change values by indexes and column name

data.loc[indexes, 'test'] = 888888  # .loc is the public API; _set_value is a private pandas method
#      test  a  b
# 0      10  1  0
# 1  888888  1  2
# 2  888888  1  2
# 3      40  2  2
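The filtering and assignment can also be collapsed into a single step with a combined boolean mask; a minimal sketch on the same data frame:

data.loc[(data["a"] == 1) & (data["b"] == 2), "test"] = 888888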


python build small network example for ANN

Posted on 2018-09-27 | Edited on 2018-07-23 | In machine learning

description: build small network example in python

initialize a network

from random import seed, random

def initialize_network(n_inputs, n_hidden, n_outputs):
    network = list()
    # each neuron gets one weight per input plus one extra weight for the bias
    hidden_layer = [{'weights': [random() for i in range(n_inputs + 1)]} for i in range(n_hidden)]
    network.append(hidden_layer)
    output_layer = [{'weights': [random() for i in range(n_hidden + 1)]} for i in range(n_outputs)]
    network.append(output_layer)
    return network

test network

seed(1)
network = initialize_network(2, 1, 2)
for layer in network:
    print(layer)

output

[{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}]
[{'weights': [0.2550690257394217, 0.49543508709194095]}, {'weights': [0.4494910647887381, 0.651592972722763]}]
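Each hidden neuron holds n_inputs + 1 weights and each output neuron n_hidden + 1 weights because of the bias term; a quick illustrative check (not part of the original post):

for layer in network:
    print([len(neuron['weights']) for neuron in layer])
# [3]
# [2, 2]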

python create simple MLP in Keras

Posted on 2018-09-27 | Edited on 2018-07-23 | In machine learning

description: create simple MLP in Keras

import packages

from keras.models import Sequential
from keras.layers import Dense
import numpy

fix random seed for reproducibility

numpy.random.seed(7)

load pima indians dataset

dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")

split into input (X) and output (Y) variables

X = dataset[:,0:8]
Y = dataset[:,8]

create model

model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

Compile model

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Fit the model

model.fit(X, Y, epochs=150, batch_size=10)

evaluate the model

scores = model.evaluate(X, Y)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

python simple convolutional neural network for MNIST dataset

Posted on 2018-09-27 | Edited on 2018-07-23 | In machine learning

description: simple CNN example for MNIST dataset

import packages

import numpy
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras import backend as K
K.set_image_dim_ordering('th')

fix random seed for reproducibility

seed = 7
numpy.random.seed(seed)

load data

(X_train, y_train), (X_test, y_test) = mnist.load_data()
# reshape to be [samples][pixels][width][height]
X_train = X_train.reshape(X_train.shape[0], 1, 28, 28).astype('float32')
X_test = X_test.reshape(X_test.shape[0], 1, 28, 28).astype('float32')

normalize inputs from 0-255 to 0-1

X_train = X_train / 255
X_test = X_test / 255
# one hot encode outputs
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
num_classes = y_test.shape[1]

define baseline model

def baseline_model():
    # create model
    model = Sequential()
    model.add(Conv2D(32, (5, 5), input_shape=(1, 28, 28), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

build, fit and evaluate the model

# build the model
model = baseline_model()
# Fit the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=200, verbose=2)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Baseline Error: %.2f%%" % (100-scores[1]*100))

output

Train on 60000 samples, validate on 10000 samples

Epoch 1/3

69s - loss: 0.2498 - acc: 0.9283 - val_loss: 0.0715 - val_acc: 0.9803

Epoch 2/3

70s - loss: 0.0714 - acc: 0.9786 - val_loss: 0.0503 - val_acc: 0.9833

Epoch 3/3
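Once fitted, the model can classify an individual image; a minimal illustrative sketch using the first test sample:

prediction = model.predict(X_test[0:1])
print(numpy.argmax(prediction))  # predicted digit
print(numpy.argmax(y_test[0]))   # true digit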

an example of RL in python

Posted on 2018-09-27 | Edited on 2018-07-23 | In machine learning

description: a small example for reinforcement learning in python

import packages and set seed

import numpy as np
import pandas as pd
import time

np.random.seed(2) # reproducible

initial parameters

NUMBER_OF_STATES = 6         # the length of the 1-dimensional world
ACTIONS = ['left', 'right']  # available actions
EPSILON = 0.9                # greedy policy
ALPHA = 0.1                  # learning rate
GAMMA = 0.9                  # discount factor
MAX_EPISODES = 13            # maximum episodes
FRESH_TIME = 0.3             # speed of actions

build q-table for actions

def build_q_table(number_of_states, actions):
    """
    just build a table of zeros in the beginning
    :param number_of_states:
    :param actions:
    :return:
    """
    table = pd.DataFrame(
        np.zeros((number_of_states, len(actions))),  # q_table initial values
        columns=actions,                             # actions' names
    )
    # print(table)  # show table
    return table
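Calling it for this 6-state world gives one row per state and one column per action, all initialized to zero:

print(build_q_table(NUMBER_OF_STATES, ACTIONS))
#    left  right
# 0   0.0    0.0
# 1   0.0    0.0
# 2   0.0    0.0
# 3   0.0    0.0
# 4   0.0    0.0
# 5   0.0    0.0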

method to choose an action

def choose_action(state, q_table):
    # This is how to choose an action
    state_actions = q_table.iloc[state, :]
    # np.random.uniform() returns a random number in [0, 1)
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
        # act non-greedy, or this state's actions have no value yet
        action_name = np.random.choice(ACTIONS)
    else:
        # act greedy; idxmax replaces argmax, which means a different function in newer versions of pandas
        action_name = state_actions.idxmax()
    return action_name

define the change of environment

def get_env_feedback(Current_State, Current_Action):
    """
    get the result of a movement: 'right' moves to state + 1, 'left' to state - 1
    :param Current_State:
    :param Current_Action:
    :return: (next state, response); response 1 means the end was reached, 0 otherwise
    """
    # This is how the agent interacts with the environment
    if Current_Action == 'right':  # move right
        if Current_State == NUMBER_OF_STATES - 2:  # terminate
            Next_State = 'terminal'
            Response = 1
        else:
            Next_State = Current_State + 1
            Response = 0
    else:  # move left
        Response = 0
        if Current_State == 0:
            Next_State = Current_State  # reach the wall
        else:
            Next_State = Current_State - 1
    return Next_State, Response
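Two illustrative calls (with NUMBER_OF_STATES = 6, state 4 sits one step to the left of the terminal):

print(get_env_feedback(4, 'right'))  # ('terminal', 1)
print(get_env_feedback(0, 'left'))   # (0, 0), the agent bumps into the wall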

update environment

def update_env(state, episode, step_counter):
    """
    print the current position of the agent to the screen
    :param state:
    :param episode:
    :param step_counter:
    :return:
    """
    # This is how the environment gets updated
    env_list = ['-'] * (NUMBER_OF_STATES - 1) + ['T']  # '-----T' is our environment
    if state == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r ', end='')
    else:
        env_list[state] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)

main part of RL loop

def rl():
    q_table = build_q_table(NUMBER_OF_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        current_state = 0
        is_terminated = False
        update_env(current_state, episode, step_counter)
        while not is_terminated:

            action = choose_action(current_state, q_table)
            next_state, R = get_env_feedback(current_state, action)  # take action & get next state and reward
            q_predict = q_table.loc[current_state, action]
            if next_state != 'terminal':
                q_target = R + GAMMA * q_table.iloc[next_state, :].max()  # next state is not terminal
            else:
                q_target = R          # next state is terminal
                is_terminated = True  # terminate this episode

            q_table.loc[current_state, action] += ALPHA * (q_target - q_predict)  # update
            current_state = next_state  # move to the next state

            update_env(current_state, episode, step_counter + 1)
            step_counter += 1
    return q_table

run the application

if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)
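Once training finishes, the greedy policy can be read off the Q-table by taking the best action per state; a small sketch (after enough episodes the non-terminal states should come to prefer 'right'):

policy = q_table.idxmax(axis=1)
print(policy)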

simplest example of a self-built LSTM in python

Posted on 2018-09-27 | Edited on 2018-07-27 | In machine learning

description: a simple self-built lstm example in plain numpy

compute sigmoid nonlinearity

import copy
import numpy as np

def sigmoid(x):
    output = 1 / (1 + np.exp(-x))
    return output

convert output of sigmoid function to its derivative

def sigmoid_output_to_derivative(output):
    return output * (1 - output)

training dataset generation

int2binary = {}
binary_dim = 8

largest_number = pow(2, binary_dim)
# generate binary table
binary = np.unpackbits(
    np.array([range(largest_number)], dtype=np.uint8).T, axis=1)
# int + binary array pair
for i in range(largest_number):
    int2binary[i] = binary[i]

input variables

alpha = 0.1
input_dim = 2
hidden_dim = 16
output_dim = 1

initialize neural network weights

synapse_0 = 2 * np.random.random((input_dim, hidden_dim)) - 1 
synapse_1 = 2 * np.random.random((hidden_dim, output_dim)) - 1
synapse_h = 2 * np.random.random((hidden_dim, hidden_dim)) - 1

synapse_0_update = np.zeros_like(synapse_0)
synapse_1_update = np.zeros_like(synapse_1)
synapse_h_update = np.zeros_like(synapse_h)

training logic (the loop runs for 10,000 training iterations)

for j in range(10000):

    # generate a simple addition problem (a + b = c)
    a_int = np.random.randint(largest_number / 2)  # int version
    a = int2binary[a_int]  # binary encoding like a: [0, 0, 0, 0, 1, 0, 0, 1]

    b_int = np.random.randint(largest_number / 2)  # int version
    b = int2binary[b_int]  # binary encoding like b: [0, 0, 1, 1, 1, 1, 0, 0]

    # true answer
    c_int = a_int + b_int
    c = int2binary[c_int]

    # where we'll store our best guess (binary encoded)
    d = np.zeros_like(c)  # like [0, 0, 0, 0, 0, 0, 0, 0]

    overallError = 0

    layer_2_deltas = list()
    layer_1_values = list()  # store the hidden layer for the next timestep
    layer_1_values.append(np.zeros(hidden_dim))

    # moving along the positions in the binary encoding
    for position in range(binary_dim):
        # generate input and output
        X = np.array([[a[binary_dim - position - 1], b[binary_dim - position - 1]]])  # like [[1, 0]]
        y = np.array([[c[binary_dim - position - 1]]]).T  # [[1]]
        # hidden layer (input ~+ prev_hidden)
        layer_1 = sigmoid(np.dot(X, synapse_0) + np.dot(layer_1_values[-1], synapse_h))

        # output layer (new binary representation)
        layer_2 = sigmoid(np.dot(layer_1, synapse_1))
        # calculate the difference and error
        layer_2_error = y - layer_2
        layer_2_deltas.append((layer_2_error) * sigmoid_output_to_derivative(layer_2))
        overallError += np.abs(layer_2_error[0])
        # decode estimate so we can print it out
        d[binary_dim - position - 1] = np.round(layer_2[0][0])

        # store hidden layer so we can use it in the next timestep
        layer_1_values.append(copy.deepcopy(layer_1))

    future_layer_1_delta = np.zeros(hidden_dim)

    # update weights (backpropagate through time)
    for position in range(binary_dim):
        X = np.array([[a[position], b[position]]])
        layer_1 = layer_1_values[-position - 1]
        prev_layer_1 = layer_1_values[-position - 2]

        # error at output layer
        layer_2_delta = layer_2_deltas[-position - 1]
        # error at hidden layer
        layer_1_delta = (future_layer_1_delta.dot(synapse_h.T) +
                         layer_2_delta.dot(synapse_1.T)) * sigmoid_output_to_derivative(layer_1)
        # let's update all our weights so we can try again
        synapse_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta)
        synapse_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta)
        synapse_0_update += X.T.dot(layer_1_delta)

        future_layer_1_delta = layer_1_delta

    synapse_0 += synapse_0_update * alpha
    synapse_1 += synapse_1_update * alpha
    synapse_h += synapse_h_update * alpha

    synapse_0_update *= 0
    synapse_1_update *= 0
    synapse_h_update *= 0

    # print out progress
    if (j % 1000 == 0):
        print("Error:" + str(overallError))
        print("Pred:" + str(d))
        print("True:" + str(c))
        out = 0
        for index, x in enumerate(reversed(d)):
            out += x * pow(2, index)
        print(str(a_int) + " + " + str(b_int) + " = " + str(out))
        print("------------")

output

Error:[3.45638663]
Pred:[0 0 0 0 0 0 0 1]
True:[0 1 0 0 0 1 0 1]
9 + 60 = 1
------------
Error:[3.63389116]
Pred:[1 1 1 1 1 1 1 1]
True:[0 0 1 1 1 1 1 1]
28 + 35 = 255
------------
Error:[3.91366595]
Pred:[0 1 0 0 1 0 0 0]
True:[1 0 1 0 0 0 0 0]
116 + 44 = 72
------------
Error:[3.72191702]
Pred:[1 1 0 1 1 1 1 1]
True:[0 1 0 0 1 1 0 1]
4 + 73 = 223
------------
Error:[3.5852713]
Pred:[0 0 0 0 1 0 0 0]
True:[0 1 0 1 0 0 1 0]
71 + 11 = 8
------------
Error:[2.53352328]
Pred:[1 0 1 0 0 0 1 0]
True:[1 1 0 0 0 0 1 0]
81 + 113 = 162
------------
Error:[0.57691441]
Pred:[0 1 0 1 0 0 0 1]
True:[0 1 0 1 0 0 0 1]
81 + 0 = 81
------------
Error:[1.42589952]
Pred:[1 0 0 0 0 0 0 1]
True:[1 0 0 0 0 0 0 1]
4 + 125 = 129
------------
Error:[0.47477457]
Pred:[0 0 1 1 1 0 0 0]
True:[0 0 1 1 1 0 0 0]
39 + 17 = 56
------------
Error:[0.21595037]
Pred:[0 0 0 0 1 1 1 0]
True:[0 0 0 0 1 1 1 0]
11 + 3 = 14
------------

reference

https://blog.csdn.net/zzukun/article/details/49968129

python use LSTM for predicting characters in text

Posted on 2018-09-27 | Edited on 2018-07-23 | In machine learning

description: use an LSTM to predict characters in text in python

import packages

import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

load ascii text and convert to lowercase

filename = "wonderland.txt"
raw_text = open(filename).read()
raw_text = raw_text.lower()

create mapping of unique chars to integers

chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

summarize the loaded data

# Total Characters:  147674
# Total Vocab: 47
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

split the book text up into subsequences with a fixed length of 100 characters

seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i: i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
# Total Patterns:  147574
print("Total Patterns: ", n_patterns)

reshape X to be [samples, time steps, features]

X = numpy.reshape(dataX, (n_patterns, seq_length, 1))

normalize

X = X / float(n_vocab)

one hot encode the output variable

y = np_utils.to_categorical(dataY)

define the LSTM model

model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
# y.shape (144312, 47)
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

define the checkpoint

filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

fit the model

model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)
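Once training has produced a set of weights, new text can be generated by repeatedly feeding the model its own predictions; a minimal sketch (the int_to_char reverse mapping is an addition here, it is not defined in the post above):

int_to_char = dict((i, c) for i, c in enumerate(chars))  # reverse of char_to_int (added for this sketch)
start = numpy.random.randint(0, len(dataX) - 1)
pattern = dataX[start]
for i in range(200):
    x = numpy.reshape(pattern, (1, len(pattern), 1)) / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    print(int_to_char[index], end='')
    pattern.append(index)
    pattern = pattern[1:]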

python ANN forward Propagate

Posted on 2018-09-27 | Edited on 2018-07-23 | In machine learning

Calculate neuron activation for an input

def activate(weights, inputs):
    activation = weights[-1]  # the last weight is the bias
    for i in range(len(weights) - 1):
        activation += weights[i] * inputs[i]
    return activation

Transfer neuron activation

from math import exp

def transfer(activation):
    return 1.0 / (1.0 + exp(-activation))

Forward propagate input to a network output

def forward_propagate(network, row):
    inputs = row
    for layer in network:
        new_inputs = []
        for neuron in layer:
            activation = activate(neuron['weights'], inputs)
            neuron['output'] = transfer(activation)
            new_inputs.append(neuron['output'])
        inputs = new_inputs
    return inputs

test forward propagation

network = [[{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}],
           [{'weights': [0.2550690257394217, 0.49543508709194095]}, {'weights': [0.4494910647887381, 0.651592972722763]}]]
row = [1, 0, None]
output = forward_propagate(network, row)
print(output)

output

[0.6629970129852887, 0.7253160725279748]

python build Decision Tree on bank note dataset

Posted on 2018-09-27 | Edited on 2018-07-23 | In machine learning

description: build Decision Tree from bank note dataset in python

CART on the Bank Note dataset

from random import seed
from random import randrange
from csv import reader

Load a CSV file

def load_csv(filename):
    file = open(filename, "r")
    lines = reader(file)
    dataset = list(lines)
    return dataset

Convert string column to float

def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

Split a dataset into k folds

def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

Calculate accuracy percentage

def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

Evaluate an algorithm using a cross validation split

def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

Split a dataset based on an attribute and an attribute value

def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

Calculate the Gini index for a split dataset

def gini_index(groups, classes):
    # count all samples at split point
    n_instances = float(sum([len(group) for group in groups]))
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        score = 0.0
        # score the group based on the score for each class
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini
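For intuition, the worst possible split (each group holds one sample of each class) scores 0.5, while a perfect split scores 0.0:

print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1]))  # 0.5
print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1]))  # 0.0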

Select the best split point for a dataset

def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

Create a terminal node value

def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

Create child splits for a node or make terminal

def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth + 1)

Build a decision tree

def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

Make a prediction with a decision tree

def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
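As a small illustration with a hand-built one-node stump (a hypothetical node, not learned from the dataset): rows whose first attribute is below 1.0 fall to the left child, everything else to the right:

stump = {'index': 0, 'value': 1.0, 'left': 0, 'right': 1}  # hypothetical node
print(predict(stump, [0.5, 2.7]))   # 0
print(predict(stump, [3.6, -1.2]))  # 1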

Classification and Regression Tree Algorithm

def decision_tree(train, test, max_depth, min_size):
    tree = build_tree(train, max_depth, min_size)
    predictions = list()
    for row in test:
        prediction = predict(tree, row)
        predictions.append(prediction)
    return predictions

Test CART on Bank Note dataset

seed(1)

load and prepare data

filename = 'data_banknote_authentication.csv'
dataset = load_csv(filename)

convert string attributes to floats

for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

evaluate algorithm

n_folds = 5
max_depth = 5
min_size = 10
scores = evaluate_algorithm(dataset, decision_tree, n_folds, max_depth, min_size)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

output

Scores: [100.0, 100.0, 100.0, 100.0, 100.0]

Mean Accuracy: 100.000%
