In [175]:

import numpy as np

import pandas as pd

import seaborn as sb

import matplotlib.pyplot as plt

import statsmodels.api as sm

from sklearn.model_selection import train_test_split

In [176]:

df=pd.read_csv('G:/qsar.csv')

In [177]:

cl=['SMILES','KOW type','Name']

df.drop(cl,inplace=True,axis=1)

In [178]:

df.tail()

df.to_csv('ass1.csv')

In [179]:

df=pd.read_csv('ass1.csv')

In [180]:

df.drop(columns=['Unnamed: 0'],inplace=True)

In [181]:

df.isnull().sum()

df.info()

RangeIndex: 1058 entries, 0 to 1057

Data columns (total 3 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 CAS 1058 non-null object

1 LogKOW 1056 non-null object

2 logBCF 1056 non-null float64

dtypes: float64(1), object(2)

memory usage: 24.9+ KB

In [182]:

dfn = df.convert_dtypes()

dfn.dtypes

Out[182]:

CAS string

LogKOW string

logBCF float64

dtype: object

In [183]:

df.at[1050:1058,'LogKOW']=np.NaN

df['LogKOW']=pd.to_numeric(df['LogKOW'],errors='coerce')

df.describe()

Out[183]:

	LogKOW	logBCF
count	1046.000000	1056.000000
mean	3.839512	2.031780
std	2.242551	1.362284
min	-4.000000	-2.480000
25%	2.375000	0.910000
50%	3.790000	2.000000
75%	5.230000	3.020000
max	18.350000	6.060000

In [184]:

df['LogKOW'].fillna(df['LogKOW'].mean(),inplace=True)

df['logBCF'].fillna(df['logBCF'].mean(),inplace=True)

In [185]:

df.isnull().sum()

Out[185]:

CAS 0

LogKOW 0

logBCF 0

dtype: int64

In [186]:

df.drop(['CAS'],inplace=True,axis=1)

In [187]:

df.dtypes

Out[187]:

LogKOW float64

logBCF float64

dtype: object

In [188]:

df['LogKOW']=((df['LogKOW']-df['LogKOW'].min())/(df['LogKOW'].max()-df['LogKOW'].min()))

df['logBCF']=((df['logBCF']-df['logBCF'].min())/(df['logBCF'].max()-df['logBCF'].min()))

df.head()

Out[188]:

	LogKOW	logBCF
0	0.281879	0.428571
1	0.266667	0.411007
2	0.267562	0.243560
3	0.224609	0.173302
4	0.312752	0.388759

In [189]:

df.head()

Out[189]:

	LogKOW	logBCF
0	0.281879	0.428571
1	0.266667	0.411007
2	0.267562	0.243560
3	0.224609	0.173302
4	0.312752	0.388759

In [190]:

sb.pairplot(df)

Out[190]:

<seaborn.axisgrid.PairGrid at 0x1e357659388>

In [191]:

sb.heatmap(df.corr(),annot=True)

Out[191]:

<matplotlib.axes._subplots.AxesSubplot at 0x1e3571b10c8>

In [192]:

df.to_csv('ass2.csv')

In [193]:

x=df['LogKOW'].values[:,None]

y=df['logBCF'].values[:,None]

In [194]:

Training DATAset:

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [195]:

print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(846, 1) (212, 1) (846, 1) (212, 1)

In [196]:

from sklearn.linear_model import LinearRegression

from scipy.stats import pearsonr

lm=LinearRegression()

m=lm.fit(x,y)

print(m.coef_,m.intercept_)

[[1.05578626]] [0.15798267]

In [197]:

y_test=lm.predict(x_test)

In [198]:

plt.scatter(x,y,color='b')

plt.plot(x_test,y_test,color='black',linewidth=3)

plt.xlabel('Height in inches')

plt.ylabel('Weigth in Pounds')

plt.show()

MOdel PErfromance¶

In [199]:

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [200]:

y_train_pred=lm.predict(x_train).ravel()

y_test_pred=lm.predict(x_test).ravel()

In [201]:

from sklearn.metrics import mean_squared_error as mse,r2_score

from sklearn.metrics import mean_absolute_error as mae

In [202]:

print("The Mean Squared Error on Train set is:\t{:0.1f}".format(mse(y_train,y_train_pred)))

print("The Mean Squared Error on Test set is:\t{:0.1f}".format(mse(y_test,y_test_pred)))

print("The Mean Average Error on Train set is:\t{:0.1f}".format(mae(y_train,y_train_pred)))

print("The Mean Average Error on Train set is:\t{:0.1f}".format(mae(y_test,y_test_pred)))

The Mean Squared Error on Train set is: 0.0

The Mean Squared Error on Test set is: 0.0

The Mean Average Error on Train set is: 0.1

In [203]:

print("The R2 score on the Train set is:\t{:0.1f}".format(r2_score(y_train,y_train_pred)))

print("The R2 score on the Test set is:\t{:0.1f}".format(r2_score(y_test,y_test_pred)))

The R2 score on the Train set is: 0.4

The R2 score on the Test set is: 0.4

ACCURACY TEST:

from sklearn.model_selection import cross_val_score accuracy=cross_val_score(lm,x,y,cv=10,scoring='r2') accuracy

array([0.1348827 , 0.20549326, 0.42319296, 0.38670252, 0.22786572, 0.41307769, 0.57789581, 0.51368184, 0.47863654, 0.3871694 ])