首页 > 其他 > 详细

Classification and Prediction

时间:2017-02-14 20:23:39      阅读:367      评论:0      收藏:0      [点我收藏+]
# coding: utf-8

# In[128]:

# Notebook setup: inline figures, plotting style, and core imports.
# The string arguments below had lost their quotes, which made these calls
# reference undefined names / invalid syntax.
get_ipython().magic(u'matplotlib inline')
import pandas as pd
from pandas import Series,DataFrame
import seaborn as sns
sns.set_style('whitegrid')
# NOTE(review): 'display.mpl_style' is an old pandas option; confirm it still
# exists in the installed pandas version before running this cell.
pd.set_option('display.mpl_style', 'default')
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Read the training and test CSV files from the author's local download folder.
train_df = pd.read_csv("/home/lpstudy/下载/train.csv")
test_df = pd.read_csv("/home/lpstudy/下载/test.csv")

# Peek at the first rows of each frame (only displayed when run interactively).
train_df.head()
test_df.head()


# In[129]:

# Remove columns that are not used as features.
# PassengerId is dropped from the training frame only; the test frame keeps it
# until the feature matrix is built.
test_df = test_df.drop(["Name", "Ticket"], axis=1)
train_df = train_df.drop(["Ticket", "PassengerId", "Name"], axis=1)


# In[130]:

# Preview the reduced training frame.
train_df.head()


# In[131]:

# Embarked

# Fill missing Embarked values in the training frame with "S".
train_df["Embarked"] = train_df["Embarked"].fillna("S")

# plot
# NOTE(review): factorplot/size are old seaborn names; confirm they exist in
# the installed seaborn version.
sns.factorplot("Embarked", "Survived", data=train_df, size=6, aspect=2)

fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))

# Passenger counts per Embarked value, and Survived counts split by Embarked.
sns.countplot(x="Embarked", data=train_df, ax=axis1)
sns.countplot(x="Survived", hue="Embarked", data=train_df, order=[1, 0], ax=axis2)

# Mean of Survived for each Embarked value.
embark_perc = train_df[["Embarked", "Survived"]].groupby(["Embarked"], as_index=False).mean()
sns.barplot(x="Embarked", y="Survived", data=embark_perc, order=["S", "C", "Q"], ax=axis3)

# Create dummy variables for Embarked and drop the "S" column, in both frames.
embark_dummies_train = pd.get_dummies(train_df["Embarked"])
embark_dummies_train.drop(["S"], axis=1, inplace=True)

embark_dummies_test = pd.get_dummies(test_df["Embarked"])
embark_dummies_test.drop(["S"], axis=1, inplace=True)

train_df = train_df.join(embark_dummies_train)
test_df = test_df.join(embark_dummies_test)

# The original Embarked column is replaced by the dummy columns joined above.
train_df.drop(["Embarked"], axis=1, inplace=True)
test_df.drop(["Embarked"], axis=1, inplace=True)


# In[132]:

# Fare

# Fill missing Fare values in the test frame with the test-set median.
# The result is assigned back to the column rather than using
# fillna(inplace=True) on a selected column, which may operate on a copy.
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].median())

# Convert Fare from float to int in both frames.
train_df["Fare"] = train_df["Fare"].astype(int)
test_df["Fare"] = test_df["Fare"].astype(int)

# Fare values split by outcome.
fare_not_survived = train_df["Fare"][train_df["Survived"] == 0]
fare_survived = train_df["Fare"][train_df["Survived"] == 1]

# Mean and standard deviation of Fare; row 0 = not survived, row 1 = survived.
avgerage_fare = DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare = DataFrame([fare_not_survived.std(), fare_survived.std()])

# plot
train_df["Fare"].plot(kind="hist", figsize=(15, 3), bins=100, xlim=(0, 50))

avgerage_fare.index.names = std_fare.index.names = ["Survived"]
avgerage_fare.plot(yerr=std_fare, kind="bar", legend=False)


# In[133]:



# Age

fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
axis1.set_title("Original Age values - Titanic")
axis2.set_title("New Age values - Titanic")

# get average, std, and number of NaN values in train_df
average_age_titanic = train_df["Age"].mean()
std_age_titanic = train_df["Age"].std()
count_nan_age_titanic = train_df["Age"].isnull().sum()

# get average, std, and number of NaN values in test_df
average_age_test = test_df["Age"].mean()
std_age_test = test_df["Age"].std()
count_nan_age_test = test_df["Age"].isnull().sum()

# generate random integers between (mean - std) & (mean + std),
# one for every missing Age value in the corresponding frame
rand_1 = np.random.randint(average_age_titanic - std_age_titanic, average_age_titanic + std_age_titanic, size=count_nan_age_titanic)
rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size=count_nan_age_test)

# plot original Age values (must happen before the NaNs are filled below)
# NOTE: drop all null values, and convert to int
train_df["Age"].dropna().astype(int).hist(bins=70, ax=axis1)
# test_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# fill NaN values in Age column with the random values generated.
# A single .loc assignment on the frame is used instead of chained indexing
# (df["Age"][mask] = ...), which may write to a temporary copy.
train_df.loc[train_df["Age"].isnull(), "Age"] = rand_1
test_df.loc[test_df["Age"].isnull(), "Age"] = rand_2

# convert from float to int
train_df["Age"] = train_df["Age"].astype(int)
test_df["Age"] = test_df["Age"].astype(int)

# plot new Age values
train_df["Age"].hist(bins=70, ax=axis2)
# test_df['Age'].hist(bins=70, ax=axis4)


# In[134]:

# .... continue with plot Age column

# density of Age for survived / not survived passengers (one curve per
# Survived value)
facet = sns.FacetGrid(train_df, hue="Survived", aspect=4)
facet.map(sns.kdeplot, "Age", shade=True)
facet.set(xlim=(0, train_df["Age"].max()))
facet.add_legend()

# mean of Survived for each Age value
fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
average_age = train_df[["Age", "Survived"]].groupby(["Age"], as_index=False).mean()
sns.barplot(x="Age", y="Survived", data=average_age)


# In[135]:

# Cabin
# It has a lot of NaN values, so it won't cause a remarkable impact on prediction
train_df.drop("Cabin", axis=1, inplace=True)
test_df.drop("Cabin", axis=1, inplace=True)

# Family

# Instead of having two columns Parch & SibSp,
# we can have only one column that represents whether the passenger had any
# family member aboard or not, i.e. whether having any family member
# (parent, sibling, ...etc) changes the chances of survival.
#
# Family starts as Parch + SibSp and every positive value is collapsed to 1;
# zeros stay 0. The assignment goes through .loc on the frame itself because
# chained indexing (df["Family"].loc[mask] = 1) may write to a temporary copy.
train_df["Family"] = train_df["Parch"] + train_df["SibSp"]
train_df.loc[train_df["Family"] > 0, "Family"] = 1

test_df["Family"] = test_df["Parch"] + test_df["SibSp"]
test_df.loc[test_df["Family"] > 0, "Family"] = 1

# drop Parch & SibSp
train_df = train_df.drop(["SibSp", "Parch"], axis=1)
test_df = test_df.drop(["SibSp", "Parch"], axis=1)

# plot
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5))

# sns.factorplot('Family', data=train_df, kind='count', ax=axis1)
sns.countplot(x="Family", data=train_df, order=[1, 0], ax=axis1)

# average of survived for those who had/didn't have any family member
family_perc = train_df[["Family", "Survived"]].groupby(["Family"], as_index=False).mean()
sns.barplot(x="Family", y="Survived", data=family_perc, order=[1, 0], ax=axis2)

# Tick labels follow order=[1, 0] used in the plots above.
axis1.set_xticklabels(["With Family", "Alone"], rotation=0)


# In[136]:

# Sex

# As we see, children(age < ~16) on aboard seem to have a high chances for Survival.
# So, we can classify passengers as males, females, and child
def get_person(passenger):
    """Classify a passenger as ``"child"`` or by their sex.

    Args:
        passenger: A two-item iterable ``(age, sex)``.

    Returns:
        The string ``"child"`` when ``age < 16``; otherwise ``sex`` is
        returned unchanged.
    """
    age, sex = passenger
    # "child" must be a string literal; as a bare name it raised NameError
    # for every passenger younger than 16.
    return "child" if age < 16 else sex
    
# Derive a Person column ("child", or the passenger's sex) from Age and Sex.
train_df["Person"] = train_df[["Age", "Sex"]].apply(get_person, axis=1)
test_df["Person"] = test_df[["Age", "Sex"]].apply(get_person, axis=1)

# No need to use Sex column since we created Person column
train_df.drop(["Sex"], axis=1, inplace=True)
test_df.drop(["Sex"], axis=1, inplace=True)

# create dummy variables for Person column, & drop Male as it has the lowest average of survived passengers
# NOTE(review): the rename assumes get_dummies yields exactly three columns in
# the order child, female, male — confirm for both frames.
person_dummies_titanic = pd.get_dummies(train_df["Person"])
person_dummies_titanic.columns = ["Child", "Female", "Male"]
person_dummies_titanic.drop(["Male"], axis=1, inplace=True)

person_dummies_test = pd.get_dummies(test_df["Person"])
person_dummies_test.columns = ["Child", "Female", "Male"]
person_dummies_test.drop(["Male"], axis=1, inplace=True)

train_df = train_df.join(person_dummies_titanic)
test_df = test_df.join(person_dummies_test)

fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))

# sns.factorplot('Person', data=train_df, kind='count', ax=axis1)
sns.countplot(x="Person", data=train_df, ax=axis1)

# average of survived for each Person(male, female, or child)
person_perc = train_df[["Person", "Survived"]].groupby(["Person"], as_index=False).mean()
sns.barplot(x="Person", y="Survived", data=person_perc, ax=axis2, order=["male", "female", "child"])

# Person was only needed for the dummies and the plots above.
train_df.drop(["Person"], axis=1, inplace=True)
test_df.drop(["Person"], axis=1, inplace=True)


# In[137]:

# Pclass

# sns.factorplot('Pclass', data=train_df, kind='count', order=[1,2,3])
# NOTE(review): factorplot/size are old seaborn names; confirm they exist in
# the installed seaborn version.
sns.factorplot("Pclass", "Survived", order=[1, 2, 3], data=train_df, size=5)

# create dummy variables for Pclass column, & drop 3rd class as it has the lowest average of survived passengers
pclass_dummies_titanic = pd.get_dummies(train_df["Pclass"])
pclass_dummies_titanic.columns = ["Class_1", "Class_2", "Class_3"]
pclass_dummies_titanic.drop(["Class_3"], axis=1, inplace=True)

pclass_dummies_test = pd.get_dummies(test_df["Pclass"])
pclass_dummies_test.columns = ["Class_1", "Class_2", "Class_3"]
pclass_dummies_test.drop(["Class_3"], axis=1, inplace=True)

# Replace the original Pclass column with the dummy columns.
train_df.drop(["Pclass"], axis=1, inplace=True)
test_df.drop(["Pclass"], axis=1, inplace=True)

train_df = train_df.join(pclass_dummies_titanic)
test_df = test_df.join(pclass_dummies_test)


# In[139]:

# Build the model inputs: the label vector, the training feature matrix
# (everything except Survived), and the test feature matrix (everything
# except PassengerId, taken as an independent copy).
Y_train = train_df["Survived"]
X_train = train_df.drop("Survived", axis=1)
X_test = test_df.drop("PassengerId", axis=1).copy()



# In[140]:

# Logistic Regression
# Fit on the training features, predict the test set, then score the model on
# the same data it was trained on (not a held-out estimate).
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
logreg.score(X_train, Y_train)


# In[141]:

# Support Vector Machines
# Fit with default settings, predict the test set (overwriting Y_pred), and
# score the model on its own training data.
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
svc.score(X_train, Y_train)



# In[142]:

# Random Forests
# Fit a 100-tree forest, predict the test set (overwriting Y_pred), and score
# the model on its own training data.
random_forest = RandomForestClassifier(n_estimators=100).fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)


# In[143]:



# Pair each feature name with its fitted Logistic Regression coefficient.
# (These are model coefficients, not correlation coefficients.)
# columns.delete(0) skips the first column label of train_df.
# NOTE(review): this presumes the first column is Survived, so the remaining
# labels line up with the columns of X_train — confirm.
coeff_df = DataFrame(train_df.columns.delete(0))
coeff_df.columns = ["Features"]
coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

# preview
coeff_df



# In[ ]:

 

Classification and Prediction

原文:http://www.cnblogs.com/lpworkstudyspace1992/p/6399063.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!