I can provide the graphics and the code; I need somebody to analyze the data for me.
There are two datasets that you need to analyze. Here are the links where you can find the data:
Seed data: https://archive.ics.uci.edu/ml/datasets/seeds
Automobile data: https://archive.ics.uci.edu/ml/datasets/Automobile
Topics for project:
1) How to compare multiple labels with respect to one single feature? Each label is attached to a 1-dim dataset of feature measurements. Datasets: Seed.
2) How to see intrinsic differences among multiple labels with respect to multiple features? Each label is attached to a K-dim dataset of feature measurements. Datasets: Seed.
3) How to deal with categorical features? Dataset: Automobile.
4) How to measure associative relations between a categorical response variable and multiple covariate features? Datasets: Seed and Automobile.
Here is the code; you may run it in JupyterLab in order to see the graphics:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.distributions.empirical_distribution import ECDF
import seaborn as sns
from matplotlib.pyplot import cm
from pyitlib import discrete_random_variable as drv
from plotnine import *
from statistics import *
from collections import Counter
import warnings
# Silence library warnings so that the notebook output stays readable.
warnings.filterwarnings("ignore")

# Load the seeds data set: whitespace-separated values with no header row.
# `sep=r"\s+"` is the supported spelling of the deprecated
# `delim_whitespace=True` option.
seed_df = pd.read_csv("seeds_dataset.txt", sep=r"\s+", header=None)

# Seven feature measurements followed by the class label ("wheat").
# The stray leading space in the original "width of kernel" label is removed.
seed_df.columns = [
    "area",
    "perimeter",
    "compactness",
    "length of kernel",
    "width of kernel",
    "asymmetry coefficient",
    "length of kernel groove",
    "wheat",
]

# Pairwise correlations over every column (features and label).
seed_df.corr()

# Correlation heat map and hierarchically clustered heat map over the seven
# feature columns only (the label column is excluded by the iloc slice).
sns.heatmap(seed_df.iloc[:, 0:7].corr())
sns.clustermap(seed_df.iloc[:, 0:7].corr())
# One row of three diagnostic plots per feature; the last column of seed_df
# is the class label and is not plotted on its own.
feature_names = seed_df.columns[:-1]
fig, axs = plt.subplots(len(feature_names), 3, figsize=(15, 10))
# top=2.5 lies outside the usual 0-1 figure fraction, so the subplot grid
# extends past the nominal canvas.
# NOTE(review): this appears to rely on the Jupyter inline backend cropping
# to a tight bounding box - confirm before saving the figure from a script.
fig.subplots_adjust(top=2.5, bottom=0.5, wspace=0.3)
axs = axs.ravel()

for row, feature in enumerate(feature_names):
    values = seed_df[feature]
    ax_cdf, ax_hist, ax_line = axs[3 * row:3 * row + 3]

    # Empirical CDF evaluated on an evenly spaced grid spanning the data range.
    ecdf = ECDF(values)
    x = np.linspace(values.min(), values.max())
    y = ecdf(x)
    ax_cdf.step(x, y)
    ax_cdf.set_title(f"Empirical CDF for {feature}")

    # Stacked histogram with one series per wheat label: the pivot puts each
    # label's measurements in its own column.
    per_label = seed_df.pivot_table(
        values=feature, index=seed_df.index, columns=["wheat"]
    )
    per_label.plot.hist(bins=50, stacked=True, ax=ax_hist)
    ax_hist.set_title(f"Gapped Histogram for {feature}")

    # Raw values in row order.
    ax_line.plot(values)
    ax_line.set_title(f"Line plot for {feature}")

# Scatter-plot matrix of all columns, coloured by wheat label.
sns.pairplot(seed_df, hue="wheat")
def entropy(Y):
    """Return the Shannon entropy (in bits) of the empirical distribution of Y.

    Each distinct entry along axis 0 of ``Y`` is treated as one outcome, so a
    1-D input is a sequence of scalar observations and a 2-D input is a
    sequence of row vectors.

    Parameters
    ----------
    Y : array-like
        Observations; anything ``np.unique`` accepts and ``len`` works on.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed outcomes.
    """
    # Only the multiplicities matter; the distinct values themselves are unused.
    _, counts = np.unique(Y, return_counts=True, axis=0)
    probs = counts / len(Y)
    return np.sum((-1) * probs * np.log2(probs))
# Joint entropy
# H(Y;X)
def jEntropy(Y, X):
    """Return the joint entropy H(Y;X) of two aligned samples.

    ``Y`` and ``X`` are placed side by side as columns so that each row is one
    joint observation; the entropy of those rows is returned.
    """
    YX = np.c_[Y, X]
    return entropy(YX)
# Conditional entropy
# conditional entropy = joint entropy - entropy of X
# H(Y|X) = H(Y;X) - H(X)
def cEntropy(Y, X):
    """Return the conditional entropy H(Y|X) = H(Y;X) - H(X)."""
    return jEntropy(Y, X) - entropy(X)
# Mutual information
# I(Y;X) = H(Y) - H(Y|X)
def mutual_info(Y, X):
    """Return the mutual information I(Y;X) = H(Y) - H(Y|X)."""
    return entropy(Y) - cEntropy(Y, X)
# Mutual information between each of the seven features and the wheat label.
# NOTE(review): `feature_selection` is not imported anywhere in the visible
# code; presumably it is `sklearn.feature_selection` - add
# `from sklearn import feature_selection` to the imports if so.
feature_selection.mutual_info_classif(
    seed_df.iloc[:, 0:7], seed_df.iloc[:, -1], discrete_features="auto"
)
def mutualy_table(df):
    """Return the pairwise mutual-information table of the feature columns.

    Every column of ``df`` except the last one is treated as a feature. The
    table is computed from the ``df`` argument itself, not from a global frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by one trailing column that is ignored.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by the feature names. The cell in
        row ``b``, column ``a`` holds ``mutual_info(df[a], df[b])``.
    """
    features = df.columns[:-1]
    n = len(features)
    # Fill a plain array first: writing cells through chained indexing
    # (`frame[i][j] = ...`) is not a reliable way to write into a DataFrame.
    table = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            table[j, i] = mutual_info(df.iloc[:, i], df.iloc[:, j])
    return pd.DataFrame(table, index=features, columns=features)
print("Mutual Information table is:")
mutualy_table(seed_df)

# Load the Automobile data set; the file has no header row, so the column
# names are assigned explicitly below.
auto_df = pd.read_csv("imports-85.data", header=None)
auto_df.columns = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]
def count_na(df):
    """Print how many cells in each column hold the "?" missing-value marker.

    Columns without any "?" cell are not reported. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame in which missing values are encoded as the string "?".
    """
    for col in df.columns:
        n_missing = int((df[col] == "?").sum())
        if n_missing:
            print(f"{col} has {n_missing} missing value")
# Report the per-column count of "?" placeholders in the raw Automobile data.
# NOTE(review): the original author states that these counts match the data
# set's documentation - not verified here.
count_na(auto_df)
def replace_na(
    df,
    numeric_cols=(
        "normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price",
    ),
    categorical_cols=("num-of-doors",),
    missing="?",
):
    """Return a copy of ``df`` with the missing-value marker imputed.

    Numeric columns get the mean of their known values; categorical columns
    get their most frequent known value. ``df`` itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame in which missing cells hold ``missing``.
    numeric_cols : iterable of str, optional
        Columns imputed with the mean. They are returned with a numeric dtype.
    categorical_cols : iterable of str, optional
        Columns imputed with the most frequent known value.
    missing : str, optional
        Marker that denotes a missing cell (default "?").

    Returns
    -------
    pandas.DataFrame
        The imputed copy.

    Raises
    ------
    KeyError
        If a listed column is not present in ``df``.
    ValueError
        If a numeric column holds a non-numeric value other than ``missing``.
    """
    df_new = df.copy()

    for col in numeric_cols:
        # A boolean mask works for any index; positional range arithmetic
        # would only be correct for a default 0..n-1 index.
        is_missing = df[col] == missing
        numeric = pd.to_numeric(df[col].mask(is_missing))
        df_new[col] = numeric.fillna(numeric.mean())

    for col in categorical_cols:
        is_missing = df[col] == missing
        known = df.loc[~is_missing, col]
        if not is_missing.any() or known.empty:
            # Nothing to fill, or nothing to derive a fill value from.
            continue
        # most_common gives the modal value; max() over a Counter would give
        # the lexicographically largest key instead.
        mode_value = Counter(known).most_common(1)[0][0]
        df_new[col] = df[col].mask(is_missing, mode_value)

    return df_new
# Imputed copy of the Automobile data; auto_df is not reassigned here.
auto_df_new = replace_na(auto_df)
# Bare expression: shows the frame when it ends a notebook cell.
auto_df_new
def change_dtype(df):
    """Convert the Automobile columns to categorical / numeric dtypes in place.

    Columns are addressed by 1-based position. The positions listed in
    ``categorical_positions`` become ``category``; every other position from
    2 to 26 is passed through ``pd.to_numeric``. Position 1 is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least 26 columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    categorical_positions = (3, 4, 5, 6, 7, 8, 9, 15, 16, 18)
    untouched = set(categorical_positions) | {1}

    # Assign by column label: `df.iloc[:, k] = values` writes into the
    # existing column and keeps its old dtype on current pandas, so the
    # conversion would have no effect.
    for pos in categorical_positions:
        name = df.columns[pos - 1]
        df[name] = pd.Categorical(df[name])

    for pos in range(1, 27):
        if pos in untouched:
            continue
        name = df.columns[pos - 1]
        df[name] = pd.to_numeric(df[name])

    return df
# Convert dtypes of the imputed frame in place (the return value is the same
# object, so it is not captured).
change_dtype(auto_df_new)

# One-hot encode the imputed, type-converted frame.
auto_df_new_dummy = pd.get_dummies(auto_df_new)

# NOTE(review): this plots the raw auto_df, which still contains "?" strings,
# not auto_df_new - confirm which frame was intended.
sns.pairplot(auto_df, hue="symboling")
The attachments are the reference report example and the project template.


0 comments