data science report

0 comments

I can provide the graphics or the code; I need somebody to analyze the data for me.

There are two datasets that you need to analyze. Here are the links where you can find the data:

Seed data: https://archive.ics.uci.edu/ml/datasets/seeds

Automobile data: https://archive.ics.uci.edu/ml/datasets/Automobile

Topics for project:

1) How to compare multiple labels with respect to one single feature? Each label is attached to a 1-dim dataset of feature measurements. Datasets: Seed.

2) How to see intrinsic differences among multiple labels with respect to multiple features? Each label is attached to a K-dim dataset of feature measurements. Datasets: Seed.

3) How to deal with categorical features? Dataset: Automobile.

4) How to measure associative relations between a categorical response variable and multiple covariate features? Datasets: Seed and Automobile.


Here is the code; you may run it in JupyterLab to see the graphics:

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from statsmodels.distributions.empirical_distribution import ECDF

import seaborn as sns

from matplotlib.pyplot import cm

from pyitlib import discrete_random_variable as drv

from plotnine import *

from statistics import *

from collections import Counter

import warnings

# Silence all library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

# The seeds file is whitespace-delimited and has no header row.
# `sep=r"\s+"` replaces the deprecated `delim_whitespace=True`.
seed_df = pd.read_csv("seeds_dataset.txt", sep=r"\s+", header=None)

# Seven measured features followed by the class label ("wheat").
seed_df.columns = [
    "area",
    "perimeter",
    "compactness",
    "length of kernel",
    "width of kernel",  # stray leading space removed from the original label
    "asymmetry coefficient",
    "length of kernel groove",
    "wheat",
]

# Correlation matrix over every column, label included.
seed_df.corr()

# Feature-only correlations (first seven columns): plain and clustered heat maps.
sns.heatmap(seed_df.iloc[:, 0:7].corr())
sns.clustermap(seed_df.iloc[:, 0:7].corr())

# One row of three diagnostic plots per feature. The last column of seed_df
# ("wheat") is the class label, so it is excluded from the per-feature rows.
feature_names = seed_df.columns[:-1]

# Size the grid from the data instead of hard-coding 7 rows, and give each row
# real vertical space. The original `subplots_adjust(top=2.5, bottom=0.5)`
# pushed the axes outside the figure canvas.
fig, axs = plt.subplots(
    len(feature_names),
    3,
    figsize=(15, 3.5 * len(feature_names)),
    squeeze=False,  # keep a 2-D array of axes even for a single feature
)
fig.subplots_adjust(wspace=0.3, hspace=0.5)

for (ecdf_ax, hist_ax, line_ax), name in zip(axs, feature_names):
    values = seed_df[name]

    # Empirical CDF evaluated on an evenly spaced grid over the observed range.
    ecdf = ECDF(values)
    x = np.linspace(values.min(), values.max())
    ecdf_ax.step(x, ecdf(x))
    ecdf_ax.set_title(f"Empirical CDF for {name}")

    # Pivot to one column per wheat class so the histogram is stacked by class.
    per_class = seed_df.pivot_table(values=name, index=seed_df.index, columns=["wheat"])
    per_class.plot.hist(bins=50, stacked=True, ax=hist_ax)
    hist_ax.set_title(f"Gapped Histogram for {name}")

    # Raw values in row order.
    line_ax.plot(values)
    line_ax.set_title(f"Line plot for {name}")

# Pairwise scatter plots of every column, coloured by wheat class.
sns.pairplot(seed_df, hue="wheat")

def entropy(Y):
    """Return the Shannon entropy (in bits) of the empirical distribution of Y.

    Y is treated as a sequence of discrete symbols. For a 2-D input each row
    is one symbol, because uniqueness is evaluated along axis 0.

    Parameters:
        Y: array-like of observations (1-D values or 2-D rows).

    Returns:
        The entropy ``-sum(p * log2(p))`` over the distinct observations.
    """
    # Only the counts are needed; the distinct values themselves are discarded.
    _, counts = np.unique(Y, return_counts=True, axis=0)
    # counts.sum() equals the number of observations along axis 0.
    prob = counts / counts.sum()
    return -np.sum(prob * np.log2(prob))

# Joint entropy H(Y, X)
def jEntropy(Y, X):
    """Return the joint entropy H(Y, X) of two aligned samples.

    Y and X are stacked column-wise so that each row is one joint
    observation, and the stacked array is passed to ``entropy``.

    Parameters:
        Y: array-like of observations.
        X: array-like of observations, aligned with Y.

    Returns:
        Whatever ``entropy`` returns for the column-stacked array.
    """
    YX = np.c_[Y, X]
    return entropy(YX)

# Conditional entropy:
#   H(Y|X) = H(Y, X) - H(X)
def cEntropy(Y, X):
    """Return the conditional entropy H(Y|X).

    Computed as the joint entropy of (Y, X) minus the entropy of X.

    Parameters:
        Y: array-like of observations of the response.
        X: array-like of observations being conditioned on, aligned with Y.
    """
    # The original used a typographic en-dash here, which is not a Python operator.
    return jEntropy(Y, X) - entropy(X)

# Mutual information:
#   I(Y;X) = H(Y) - H(Y|X)
def mutual_info(Y, X):
    """Return the mutual information I(Y;X) between two aligned samples.

    Computed as the entropy of Y minus the conditional entropy of Y given X.

    Parameters:
        Y: array-like of observations.
        X: array-like of observations, aligned with Y.
    """
    # The original used a typographic en-dash here, which is not a Python operator.
    return entropy(Y) - cEntropy(Y, X)

# NOTE(review): `feature_selection` is not imported anywhere in the visible code.
# This call presumably needs `from sklearn import feature_selection` - confirm
# and add it to the import section before running.
# Estimates mutual information between each of the first seven columns and the
# last column of seed_df.
feature_selection.mutual_info_classif(
    seed_df.iloc[:, 0:7],
    seed_df.iloc[:, -1],
    discrete_features="auto",
)

def mutualy_table(df):
    """Return the pairwise mutual-information table of the feature columns of df.

    Every column except the last is treated as a feature. Entry (row b,
    column a) holds ``mutual_info(df[a], df[b])``.

    Parameters:
        df: DataFrame whose last column is excluded from the table.

    Returns:
        A square DataFrame indexed and labelled by the feature column names.
    """
    features = df.columns[:-1]
    n = len(features)
    table = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            # Read from the `df` argument; the original read the global seed_df,
            # so the function ignored whatever frame was passed in.
            table[j, i] = mutual_info(df.iloc[:, i], df.iloc[:, j])
    # Build the labelled frame in one step instead of chained `ce_df[i][j] = ...`
    # assignment, which pandas does not guarantee to write through.
    return pd.DataFrame(table, index=features, columns=features)

print("Mutual Information table is:")
# Bare expression: the table is rendered only when this is the last line of a
# notebook cell.
mutualy_table(seed_df)

# The automobile file has no header row; "?" placeholders are kept as strings
# here because count_na and replace_na look for the literal "?".
auto_df = pd.read_csv("imports-85.data", header=None)
auto_df.columns = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]

def count_na(df):
    """Print, for each column of df, how many cells hold the placeholder "?".

    Columns without any "?" are not reported.

    Parameters:
        df: DataFrame in which missing values are encoded as the string "?".

    Returns:
        None; the report is written to standard output.
    """
    for col in df.columns:
        # Summing the boolean mask counts the matching cells directly.
        n_missing = int((df[col] == "?").sum())
        if n_missing != 0:
            print(f"{col} has {n_missing} missing value")

# Report the per-column "?" counts; per the author, these match the
# missing-value counts given in the dataset's documentation.

count_na(auto_df)

def replace_na(df):
    """Return a copy of df with "?" placeholders imputed in selected columns.

    Numeric columns ("normalized-losses", "bore", "stroke", "horsepower",
    "peak-rpm", "price") are parsed as numbers and each "?" is replaced by the
    mean of the observed values. These columns come back with a numeric dtype
    rather than as strings with a float mixed in.

    The categorical column "num-of-doors" has each "?" replaced by the most
    frequent observed value.

    Parameters:
        df: DataFrame containing the columns listed above, with missing
            values encoded as the string "?". It is not modified.

    Returns:
        A new DataFrame with the placeholders replaced.

    Raises:
        KeyError: if one of the listed columns is absent.
        ValueError: if a numeric column holds a non-numeric value other than "?".
    """
    numeric_cols = ["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]
    categorical_cols = ["num-of-doors"]

    df_new = df.copy()

    for col in numeric_cols:
        # A boolean mask works for any index; the original paired range(0, n)
        # with .loc, which is only right for a default RangeIndex.
        missing = df[col] == "?"
        # Blank the placeholders first so to_numeric still raises on other junk.
        numeric = pd.to_numeric(df[col].mask(missing))
        df_new[col] = numeric.mask(missing, numeric.mean())

    for col in categorical_cols:
        missing = df[col] == "?"
        # Most frequent observed value. The original `max(Counter(...))`
        # returned the alphabetically largest key, not the mode.
        modes = df.loc[~missing, col].mode()
        if missing.any() and not modes.empty:
            df_new.loc[missing, col] = modes.iloc[0]

    return df_new

# Cleaned copy of the automobile data produced by replace_na; auto_df keeps its "?" cells.
auto_df_new = replace_na(auto_df)

# Bare expression: rendered as output only when it is the last line of a notebook cell.
auto_df_new

def change_dtype(df):
    """Convert the columns of a 26-column frame to categorical or numeric dtype, in place.

    Columns are selected by 1-based position. Positions 3-9, 15, 16 and 18
    become categorical. Every other position from 2 to 26 is parsed with
    ``pd.to_numeric``. Position 1 is left untouched.

    Parameters:
        df: DataFrame with at least 26 uniquely labelled columns. It is
            modified in place.

    Returns:
        The same DataFrame object, for convenience.

    Raises:
        IndexError: if df has fewer than 26 columns.
        ValueError: if a column in a numeric position cannot be parsed.
    """
    categorical_positions = [3, 4, 5, 6, 7, 8, 9, 15, 16, 18]
    numeric_positions = sorted(set(range(2, 27)) - set(categorical_positions))

    # Assign whole columns by label. `df.iloc[:, k] = values` writes into the
    # existing column and keeps its old dtype on recent pandas, so the
    # original conversions were silently lost.
    for pos in categorical_positions:
        label = df.columns[pos - 1]
        df[label] = pd.Categorical(df[label])

    for pos in numeric_positions:
        label = df.columns[pos - 1]
        df[label] = pd.to_numeric(df[label])

    return df

# Convert column dtypes of the cleaned frame in place.
change_dtype(auto_df_new)

# One-hot encode the non-numeric columns of the cleaned frame.
auto_df_new_dummy = pd.get_dummies(auto_df_new)

# Plot the cleaned frame, not the raw auto_df: columns that still contain "?"
# strings in auto_df are non-numeric and would not appear in the pair plot.
sns.pairplot(auto_df_new, hue="symboling")

The attachments are the reference report example and the project template.

About the Author

Follow me


{"email":"Email address invalid","url":"Website address invalid","required":"Required field missing"}