import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# NOTE: load_boston was deprecated and then removed in scikit-learn 1.2,
# so this section requires scikit-learn < 1.2.
from sklearn.datasets import load_boston
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['Price'] = boston.target
# Input Data
x = boston.data
# Output Data
y = boston.target
# Split the data into training and testing sets.
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)
print("xtrain shape : ", xtrain.shape)
print("xtest shape : ", xtest.shape)
print("ytrain shape : ", ytrain.shape)
print("ytest shape : ", ytest.shape)
# Fit a multiple linear regression model to the training set.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(xtrain, ytrain)
# predicting the test set results
y_pred = regressor.predict(xtest)
# Plotting Scatter graph to show the prediction
# results - 'ytrue' value vs 'y_pred' value
plt.scatter(ytest, y_pred, c = 'green')
plt.xlabel("Price: in $1000's")
plt.ylabel("Predicted value")
plt.title("True value vs predicted value : Linear Regression")
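# A y = x reference line makes over/under-prediction easier to see in the scatter.
lims = [min(ytest.min(), y_pred.min()), max(ytest.max(), y_pred.max())]
plt.plot(lims, lims, linestyle='--', color='red', label='y = x')
plt.legend()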
plt.show()
# Results of Linear Regression.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(ytest, y_pred)
print("Mean Square Error : ", mse)
# 1st Try:
import pandas as pd
import numpy as np
import time
# import os
# import webbrowser
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import CuDNNLSTM  # GPU-only variant of LSTM (requires CUDA)
from keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
import dask.dataframe as dd
train = dd.read_csv('train.csv', engine='c', low_memory=False, parse_dates=['click_time','attributed_time']).fillna(0)
test = pd.read_csv('test.csv', engine='c').fillna(0)
# unix = pd.to_datetime(train['click_time'])
# train['click_time'] = unix.view('int64') / pd.Timedelta(1, unit='s')
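# The commented-out conversion above mixes int64 with a Timedelta; a minimal
# working helper (to_unix_seconds is a hypothetical name), assuming a
# datetime64[ns] column, which stores nanoseconds since the Unix epoch:
def to_unix_seconds(series):
    return pd.to_datetime(series).astype('int64') // 10**9
# e.g. train['click_time'] = to_unix_seconds(train['click_time'])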
def throwaway():
global train
# print(train.head())
print(test.info())
throwaway()
def dataPreProcessTime(df):
# df['click_time'] = pd.to_datetime(df['click_time']).dt.date
df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
return df
def preprocess():
    global train
    global train_scaled
    global X_train
    global Y_train
    # MinMaxScaler needs an in-memory array; if `train` is still a dask
    # DataFrame, materialize it first with train = train.compute().
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = scaler.fit_transform(train)
    X_train = []
    Y_train = []
    # NOTE: a 100,000-step window over ~185M rows is far too large to hold in
    # memory as explicit sequences; in practice this needs a generator or a
    # much smaller window.
    for i in range(100000, 184903889):
        X_train.append(train_scaled[i-100000:i, 0])
        Y_train.append(train_scaled[i, 0])
    X_train, Y_train = np.array(X_train), np.array(Y_train)
    # Reshape to (samples, timesteps, features); only column 0 is used, so
    # there is one feature per timestep.
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    model = Sequential()
    model.add(CuDNNLSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
    model.add(Dropout(0.2))
    model.add(CuDNNLSTM(50, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(CuDNNLSTM(50, return_sequences=True))
    model.add(Dropout(0.2))
    # The last recurrent layer must not return sequences so that Dense(1)
    # sees a 2-D (samples, units) tensor.
    model.add(CuDNNLSTM(1))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(loss="mean_squared_error", optimizer="RMSprop")
    model.fit(X_train, Y_train, epochs=100, batch_size=5000, verbose=2)
    # X_test / Y_test would have to be built from a held-out slice, the same
    # way as X_train / Y_train, before this evaluation can run.
    # test_error_rate = model.evaluate(X_test, Y_test, verbose=0)
    # print("The mean squared error (MSE) for the test data set is: {}".format(test_error_rate))
    model.save("trained_model.h5")
    print("Model saved to disk.")
def nn():
    # Dense regression on house-price style data: 208 scaled features,
    # SalePrice as the target.
    X = pd.read_csv('scaled_train.csv').values
    df = pd.read_csv('train.csv')
    Y = df[['SalePrice']].values
    model = Sequential()
    model.add(Dense(50, input_dim=208, activation='relu', name='Input_Layer'))
    model.add(Dense(100, activation='relu', name='Hidden_Layer1'))
    model.add(Dense(50, activation='relu', name='Hidden_Layer2'))
    model.add(Dense(1, activation='linear', name='Output_Layer'))
    model.compile(loss="mean_squared_error", optimizer="adam")
    model.fit(X, Y, epochs=50, shuffle=True, verbose=2)
    X_test = pd.read_csv('scaled_test.csv').values
    # CAUTION: these are training labels reused as stand-in test targets, so
    # the printed MSE is not a real test score.
    Y_test = Y[0:1459]
    test_error_rate = model.evaluate(X_test, Y_test, verbose=0)
    print("The mean squared error (MSE) for the test data set is: {}".format(test_error_rate))
    model.save("trained_model.h5")
    print("Model saved to disk.")
# nn()
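# A saved model can be loaded back from the .h5 file for later predictions;
# a minimal sketch (load_trained_model is a hypothetical helper name):
def load_trained_model(path="trained_model.h5"):
    from keras.models import load_model
    return load_model(path)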
# 2nd Try:
import pandas as pd
import numpy as np
import time
# import os
# import webbrowser
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import CuDNNLSTM
from keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
import dask.dataframe as dd
train = pd.read_csv('train_small.csv', engine='c', low_memory=False).fillna(0)
test = pd.read_csv('test_small.csv', engine='c', low_memory=False).fillna(0)
# test = pd.read_csv('test.csv', engine='c', low_memory=False).fillna(0)
# unix = pd.to_datetime(train['click_time'])
# train['click_time'] = unix.view('int64') / pd.Timedelta(1, unit='s')
def throwaway():
global train
# print(train.head())
print(test.info())
# throwaway()
def dataPreProcessTime(df):
df['click_time'] = pd.to_datetime(df['click_time']).dt.date
df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
return df
def feature_engineering():
    # Ideas: how many unique IPs are there? What is the time difference
    # between clicks from the same IP? Also wrap the NN setup in reusable
    # functions (sentdex-style). A sketch of the IP features follows below.
    pass
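# A minimal sketch of the IP ideas above, assuming TalkingData-style `ip` and
# `click_time` columns (engineer_ip_features is a hypothetical helper name):
def engineer_ip_features(df):
    # clicks per ip: how many times each ip appears in the data
    df['ip_click_count'] = df.groupby('ip')['ip'].transform('count')
    # seconds since the previous click from the same ip (NaN for its first
    # click); assumes rows are already in time order
    clicks = pd.to_datetime(df['click_time'])
    df['ip_time_delta'] = clicks.groupby(df['ip']).diff().dt.total_seconds()
    return df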
train = dataPreProcessTime(train)
test = dataPreProcessTime(test)
def preprocess():
    global train
    global test
    global train_scaled
    # train['click_time'] = train['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    # train['attributed_time'] = train['attributed_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    # test['click_time'] = test['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    scaler = MinMaxScaler(feature_range=(0, 1))
    # If attributed_time is present it is still non-numeric at this point
    # (timestamp strings mixed with fillna zeros); it must be converted or
    # dropped before scaling.
    train_scaled = scaler.fit_transform(train)
    # fit() alone returns the scaler object, not scaled data; the test file
    # has its own column set here, so it gets its own fit_transform (reuse
    # scaler.transform(test) instead whenever the columns match).
    test_scaled = scaler.fit_transform(test)
    # The scaled results are numpy arrays, so take column names from the
    # original frames.
    scaled_train = pd.DataFrame(train_scaled, columns=train.columns)
    scaled_test = pd.DataFrame(test_scaled, columns=test.columns)
    # index=False keeps the index from becoming a spurious feature column
    # when these files are read back in nn().
    scaled_train.to_csv('scaled_train.csv', index=False)
    scaled_test.to_csv('scaled_test.csv', index=False)
preprocess()
def nn():
    X = pd.read_csv('scaled_train.csv').values
    df = pd.read_csv('train_small.csv')
    Y = df[['is_attributed']].values
    # LSTM layers expect 3-D input (samples, timesteps, features); treat each
    # scaled column as one timestep with a single feature.
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))
    model = Sequential()
    model.add(CuDNNLSTM(50, return_sequences=True, input_shape=(X.shape[1], 1)))
    model.add(Dropout(0.2))
    model.add(CuDNNLSTM(50, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(CuDNNLSTM(50, return_sequences=True))
    model.add(Dropout(0.2))
    # The last recurrent layer must not return sequences so that Dense(1)
    # sees a 2-D (samples, units) tensor.
    model.add(CuDNNLSTM(1))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    # is_attributed is binary, so binary_crossentropy would be a more natural
    # loss than MSE here.
    model.compile(loss="mean_squared_error", optimizer="RMSprop")
    model.fit(X, Y, epochs=100, batch_size=5000, verbose=2)
    X_test = pd.read_csv('scaled_test.csv').values
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
    # CAUTION: reusing the training labels as test targets only lines up if
    # both files have the same number of rows, and it is not a real test score.
    Y_test = Y
    test_error_rate = model.evaluate(X_test, Y_test, verbose=0)
    print("The mean squared error (MSE) for the test data set is: {}".format(test_error_rate))
    model.save("trained_model.h5")
    print("Model saved to disk.")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
dataset = pd.read_csv(url, names=names)
# dataset.to_csv('iris.csv')
X = dataset.iloc[:, 0:4].values
print(X)
y = dataset.iloc[:, 4].values
print(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
def main(X_train, X_test, y_train, y_test):
    # Standardize, project to a single LDA component, then classify with a
    # small random forest.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    lda = LDA(n_components=1)
    X_train = lda.fit_transform(X_train, y_train)
    X_test = lda.transform(X_test)
    classifier = RandomForestClassifier(max_depth=2, random_state=0)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
# main(X_train, X_test, y_train, y_test)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn import tree
# https://www.kdnuggets.com/2018/01/managing-machine-learning-workflows-scikit-learn-pipelines-part-2.html
# Load and split the data
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
# Construct pipeline
pipe = Pipeline([('scl', StandardScaler()),
('pca', PCA(n_components=2)),
('clf', tree.DecisionTreeClassifier(random_state=42))])
# Fit the pipeline
pipe.fit(X_train, y_train)
# Pipeline test accuracy
print('Test accuracy: %.3f' % pipe.score(X_test, y_test))
# Pipeline estimator params; the classifier is the third step (steps[2]) and
# the estimator object is the second element of that (name, estimator) tuple ([1])
print('\nModel hyperparameters:\n', pipe.steps[2][1].get_params())
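# With a pipeline, a natural next step is a grid search over step parameters,
# addressed by '<step>__<param>' names; a minimal sketch on the same data:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'pca__n_components': [2, 3],
    'clf__max_depth': [2, 3, 4],
}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print('Best params:', grid.best_params_)
print('Best CV accuracy: %.3f' % grid.best_score_)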
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
np.random.seed(0)
iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
le = LabelEncoder()
iris['label'] = le.fit_transform(iris['label'])
X = np.array(iris.drop(['label'], axis=1))
y = np.array(iris['label'])
y = to_categorical(y, num_classes=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(iris.head())
def c_model():
model = Sequential()
    model.add(Dense(32, input_dim=4, activation='relu'))  # 4 iris input features
model.add(Dense(16, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
# model = KerasClassifier(build_fn=c_model, epochs=2, batch_size=32)
# model.fit(X_train, y_train)
# p_test = model.predict_proba(X_test)
# out = pd.DataFrame(p_test)
# out.to_csv('test1_iris.csv')
# Grid Search
# model = KerasClassifier(build_fn=c_model)
# batch_sizes = [10, 20, 50, 100]
# epochs = [5, 10, 50]
# parameters = {'batch_size': batch_sizes, 'epochs': epochs}
# clf = GridSearchCV(model, parameters)
# clf.fit(X_train, y_train)
# print(clf.best_score_, clf.best_params_)
# means = clf.cv_results_['mean_test_score']
# parameters = clf.cv_results_['params']
# for mean, parameter in zip(means, parameters):
#     print(mean, parameter)
def c_model(optimizer):
model = Sequential()
    model.add(Dense(32, input_dim=4, activation='relu'))  # 4 iris input features
model.add(Dense(16, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
return model
model = KerasClassifier(build_fn=c_model, epochs=1, batch_size=32)
parameters = {'optimizer':['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']}
clf = GridSearchCV(model, parameters)
clf.fit(X_train, y_train)
# TUNE ACTIVATION FUNCTION
print(clf.best_score_, clf.best_params_)
means = clf.cv_results_['mean_test_score']
parameters = clf.cv_results_['params']
for mean, parameter in zip(means, parameters):
    print(mean, parameter)
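# cv_results_ can also be viewed as a DataFrame for easier sorting:
results = pd.DataFrame(clf.cv_results_)[['param_optimizer', 'mean_test_score', 'std_test_score']]
print(results.sort_values('mean_test_score', ascending=False))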
# def c_model(activation):
# model = Sequential()
# model.add(Dense(32, activation=activation))
# model.add(Dense(16, activation=activation))
# model.add(Dense(3, activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# return model
# model = KerasClassifier(build_fn=c_model, epochs=50, batch_size=32)
# parameters = {'activation':['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']}
# clf = GridSearchCV(model, parameters)
# clf.fit(X_train, y_train)
# print(clf.best_score_, clf.best_params_)
# means = clf.cv_results_['mean_test_score']
# parameters = clf.cv_results_['params']
# for mean, parameter in zip(means, parameters):
#     print(mean, parameter)