Commit 3cf41f86 authored by Andrew Hrdy

Initial commit

[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
pylint = "*"
black = "*"
[packages]
keras = "*"
numpy = "*"
scikit-learn = "*"
tensorflow-gpu = "*"
scipy = "*"
imbalanced-learn = "*"
[requires]
python_version = "3.7"
[pipenv]
allow_prereleases = true
HW2 - Drug Activity Prediction - Andrew Hrdy <G01012745>
Files:
- main.py will run the entire algorithm on all training and testing data
- test.py will run k-fold validation on the algorithm and print out the F1-Score and confusion matrix
- utils.py contains various utility functions, such as parsing the data and converting Dr. Rangwala's sparse format to a SciPy CSR matrix
- Files found in the `classifiers` directory are the various classifiers I tried. They can be swapped out in main.py and test.py by replacing
the classifier on lines 30 and 33, respectively (see the sketch after this list).
- Files found in the `dim_reduction` directory are the different dimensionality reduction techniques I tried. They can also be swapped out in main.py
and test.py by replacing the dim_reducer on lines 21 and 27, respectively.
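
For example, a minimal sketch of the classifier swap in main.py (the dim_reducer swap works the same way; `svm` is the class defined in `classifiers/svm.py`):

```python
# main.py -- replace the decision tree import and constructor with the SVM ones
from classifiers.svm import svm  # instead of: from classifiers.decision_tree import decision_tree

classifier = svm()  # instead of: classifier = decision_tree()
```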
Running:
- To download the dependencies, run `pipenv install`
- To run the project, run `pipenv shell`, then `py main.py`
- Note: the TensorFlow dependency used is `tensorflow-gpu`
Submissions:
- All submissions can be found in `./out`. `HW2_Hrdy_sub8.txt` is the final submission.
from sklearn.tree import DecisionTreeClassifier


class decision_tree:
    def fit(self, x, y):
        # Varying properties of the decision tree were tested. The Gini index was
        # superior to entropy. In addition, the number of samples required to form
        # a leaf node was changed, but this had little effect on the F1-Score.
        self.tree = DecisionTreeClassifier(criterion="gini")
        self.tree.fit(x, y)

    def predict(self, x):
        return self.tree.predict(x)
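
# Example usage (a minimal sketch; x_train, y_train, and x_test are placeholder
# arrays -- a 2-D feature matrix and a 1-D label vector):
#   clf = decision_tree()
#   clf.fit(x_train, y_train)
#   y_pred = clf.predict(x_test)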
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB


class naive_bayes:
    def fit(self, x, y):
        # Both Bernoulli and Gaussian distributions were tested. The Gaussian
        # distribution performed very poorly and had F1-Scores under .2. Bernoulli
        # was a close contender to the Decision Tree, but could not seem to out-perform it.
        self.nb = BernoulliNB()
        self.nb.fit(x, y)

    def predict(self, x):
        return self.nb.predict(x)
import tensorflow as tf


class neural_network:
    def __init__(self, input_dims):
        self.input_dims = input_dims

    def fit(self, x, y):
        m = tf.keras.Sequential()
        # units must be an integer, so use floor division for the half-width layer
        m.add(tf.keras.layers.Dense(units=self.input_dims // 2, input_shape=(self.input_dims,), activation="relu"))
        # A variety of additional layers were added here (between 0 and 10) during testing. With all of these
        # layers, the neural network did not perform better than the decision tree.
        m.add(tf.keras.layers.Dropout(0.5))
        m.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
        m.compile(optimizer="adam", loss="binary_crossentropy")
        m.fit(x=x, y=y, epochs=1000)
        self.predictor = m

    def predict(self, x):
        return self.predictor.predict_classes(x)
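
# Note: Sequential.predict_classes was removed in newer TensorFlow releases (2.6+).
# An equivalent for this single-unit sigmoid output would be:
#   (self.predictor.predict(x) > 0.5).astype("int32")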
from sklearn.svm import SVC


class svm:
    def fit(self, x, y):
        # Linear and RBF kernels were tested with varying C values.
        # Linear was much more effective. RBF was only somewhat
        # effective when the value of C was increased, but could
        # not compete with linear.
        self.svc = SVC(kernel="linear", C=1)
        self.svc.fit(x, y)

    def predict(self, x):
        return self.svc.predict(x)
import tensorflow as tf


class autocoder:
    def __init__(self, input_size: int, hidden_size: int, out_dims: int):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.out_dims = out_dims

    def fit(self, x):
        # A neural network that maps X to X using 3 hidden layers.
        # The network is symmetric about the reduced_dim_layer. Going from
        # the input layer to the reduced_dim_layer encodes the data to the
        # lower dimension, while going from the reduced_dim_layer to the output_layer
        # decodes it back to the original value. After the neural network is trained,
        # the decoder can be ignored.
        input_layer = tf.keras.layers.Input(shape=(self.input_size,))
        hidden_in = tf.keras.layers.Dense(self.hidden_size, activation="relu")(input_layer)
        reduced_dim_layer = tf.keras.layers.Dense(self.out_dims, activation="relu")(hidden_in)
        hidden_out = tf.keras.layers.Dense(self.hidden_size, activation="relu")(reduced_dim_layer)
        output_layer = tf.keras.layers.Dense(self.input_size, activation="sigmoid")(hidden_out)
        autoencoder = tf.keras.models.Model(input_layer, output_layer)
        encoder = tf.keras.models.Model(input_layer, reduced_dim_layer)
        autoencoder.compile(optimizer="adam", loss="binary_crossentropy")
        autoencoder.fit(x, x, epochs=1000)
        self.encoder = encoder

    def transform(self, x):
        return self.encoder.predict(x)
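
# Example usage (a minimal sketch; all sizes below are hypothetical placeholders):
#   import numpy as np
#   x = np.random.rand(100, 1000).astype("float32")
#   ac = autocoder(input_size=1000, hidden_size=256, out_dims=64)
#   ac.fit(x)                    # trains the autoencoder on x -> x
#   x_reduced = ac.transform(x)  # encoded output, shape (100, 64)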
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import issparse
from utils import convert_rangwala_to_csr


class svd:
    def __init__(self, out_dims):
        self.out_dims = out_dims

    def fit(self, x):
        # Use Singular Value Decomposition for dimensionality reduction
        svd = TruncatedSVD(self.out_dims, "randomized", random_state=5)
        # Some of the test cases may already pass a scipy sparse matrix; if that is
        # the case, use it, otherwise convert the Rangwala sparse format to a CSR.
        svd.fit(x if issparse(x) else convert_rangwala_to_csr(x))
        # Prints the total explained variance (the sum of each component's variance;
        # there will be a total of out_dims components)
        print(f'Variance = {svd.explained_variance_ratio_.sum()}')
        self.svd = svd

    def transform(self, x):
        return self.svd.transform(x if issparse(x) else convert_rangwala_to_csr(x))
from imblearn.over_sampling import SMOTE
from dim_reduction.svd import svd
from classifiers.decision_tree import decision_tree
from utils import convert_rangwala_to_csr, parse_train_file, parse_test_file

train_file = "./resources/1568646872_6486642_train_drugs.data"
test_file = "./resources/1568646872_659426_test.data"
out_file = "./out/HW2_Hrdy_sub18.txt"

# Parse the training and testing files
input_features, input_classes = parse_train_file(train_file)
test_features = parse_test_file(test_file)

# Use SMOTE to synthetically generate more data for the minority class
x_train_resampled, y_train_resampled = SMOTE().fit_resample(convert_rangwala_to_csr(input_features), input_classes)

# Note: The number of output dims can be greater than 800 (the size of the original
# training data) because after SMOTE the total number of "samples" is greater than 800
dim_reducer = svd(803)
dim_reducer.fit(x_train_resampled)

# Transform the X data for both the training and test sets to the reduced dimensions
x_train_transformed = dim_reducer.transform(x_train_resampled)
x_test_transformed = dim_reducer.transform(test_features)

# The following line can be replaced to use any of the classifiers in the classifiers
# directory in order to test them.
classifier = decision_tree()
classifier.fit(x_train_transformed, y_train_resampled)
y_pred = classifier.predict(x_test_transformed)

# Write the predictions to the output file
# (a separate handle name avoids shadowing the out_file path)
with open(out_file, "w") as f:
    for pred in y_pred:
        f.write(str(pred) + "\n")
1
1
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
1
1
1
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from dim_reduction.svd import svd
from classifiers.decision_tree import decision_tree
from utils import convert_rangwala_to_csr, parse_train_file

train_file = "./resources/1568646872_6486642_train_drugs.data"

# Parse the training file
input_features, input_classes = parse_train_file(train_file)

# Use a stratified k-fold so every split keeps the class ratio, since the data is
# unbalanced. shuffle=True is required for random_state to take effect.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
for train_index, test_index in kf.split(input_features, input_classes):
    x_train, x_test = input_features[train_index], input_features[test_index]
    y_train, y_test = input_classes[train_index], input_classes[test_index]

    # Use SMOTE to synthetically generate more data for the minority class
    x_train_resampled, y_train_resampled = SMOTE().fit_resample(convert_rangwala_to_csr(x_train), y_train)

    dim_reducer = svd(600)
    dim_reducer.fit(x_train_resampled)
    x_train_transformed = dim_reducer.transform(x_train_resampled)
    x_test_transformed = dim_reducer.transform(x_test)

    classifier = decision_tree()
    classifier.fit(x_train_transformed, y_train_resampled)
    y_pred = classifier.predict(x_test_transformed)

    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
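
# A possible extension (a sketch, not part of the assignment code): collect the
# positive-class F1 per fold and report the mean, so different dim_reducer /
# classifier combinations can be compared with a single number:
#   from sklearn.metrics import f1_score
#   fold_f1s = []                               # before the loop
#   fold_f1s.append(f1_score(y_test, y_pred))   # at the end of each fold
#   print(f"Mean F1 = {sum(fold_f1s) / len(fold_f1s):.3f}")  # after the loop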
import tensorflow as tf
import numpy as np
from scipy.sparse import csr_matrix


# Converts a sparse matrix in the format Dr. Rangwala gave to a SciPy CSR matrix
def convert_rangwala_to_csr(rang_sparse):
    indptr = [0]
    indices = []
    for line in rang_sparse:
        for j in line:
            indices.append(j)
        indptr.append(len(indices))
    data = np.array([1] * len(indices))
    return csr_matrix((data, np.array(indices), np.array(indptr)), shape=(len(rang_sparse), 100001), dtype=np.int32)
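
# Worked example (assumed input format: each inner list holds the column indices
# of the nonzero features for one row):
#   convert_rangwala_to_csr([[0, 3], [2]]) produces a 2 x 100001 CSR matrix
#   with ones at positions (0, 0), (0, 3), and (1, 2) and zeros elsewhere.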
# Converts a sparse matrix in the format Dr. Rangwala gave to a dense Tensorflow tensor
def convert_rangwala_to_tensor(rang_sparse):
    tensor_indices = []
    for i, line in enumerate(rang_sparse):
        for j in line:
            tensor_indices.append([i, j])
    tensor_values = [1] * len(tensor_indices)
    tensor_dense_shape = [len(rang_sparse), max([j for i in rang_sparse for j in i]) + 1]
    return tf.sparse.to_dense(tf.SparseTensor(indices=tensor_indices, values=tensor_values, dense_shape=tensor_dense_shape))


# Parses the training file
def parse_train_file(train_file_path):
    with open(train_file_path, "r") as train_file:
        lines = [[int(x) for x in line.split()] for line in train_file.readlines()]

    # Arrays of features and classes where the i-th values in each array correspond
    input_features = []
    input_classes = []
    for i in range(len(lines)):
        input_features.append(lines[i][1:])
        input_classes.append(lines[i][0])

    # Convert the features and classes arrays to numpy arrays for easier handling
    input_features = np.array(input_features)
    input_classes = np.array(input_classes)
    return (input_features, input_classes)


# Parses the testing file
def parse_test_file(test_file_path):
    with open(test_file_path, "r") as test_file:
        return [[int(x) for x in line.split()] for line in test_file.readlines()]