In [175]:
1
import numpy as np
2
import pandas as pd
3
import seaborn as sb
4
import matplotlib.pyplot as plt
5
import statsmodels.api as sm
6
from sklearn.model_selection import train_test_split
In [176]:
1
df=pd.read_csv('G:/qsar.csv')
In [177]:
1
cl=['SMILES','KOW type','Name']
2
df.drop(cl,inplace=True,axis=1)
In [178]:
1
df.tail()
2
df.to_csv('ass1.csv')
In [179]:
1
df=pd.read_csv('ass1.csv')
In [180]:
1
df.drop(columns=['Unnamed: 0'],inplace=True)
In [181]:
1
df.isnull().sum()
2
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1058 entries, 0 to 1057
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CAS 1058 non-null object
1 LogKOW 1056 non-null object
2 logBCF 1056 non-null float64
dtypes: float64(1), object(2)
memory usage: 24.9+ KB
In [182]:
1
dfn = df.convert_dtypes()
2
dfn.dtypes
Out[182]:
CAS string
LogKOW string
logBCF float64
dtype: object
In [183]:
1
df.at[1050:1058,'LogKOW']=np.NaN
2
df['LogKOW']=pd.to_numeric(df['LogKOW'],errors='coerce')
3
df.describe()
Out[183]:
LogKOW | logBCF | |
count | 1046.000000 | 1056.000000 |
mean | 3.839512 | 2.031780 |
std | 2.242551 | 1.362284 |
min | -4.000000 | -2.480000 |
25% | 2.375000 | 0.910000 |
50% | 3.790000 | 2.000000 |
75% | 5.230000 | 3.020000 |
max | 18.350000 | 6.060000 |
In [184]:
1
df['LogKOW'].fillna(df['LogKOW'].mean(),inplace=True)
2
df['logBCF'].fillna(df['logBCF'].mean(),inplace=True)
In [185]:
1
df.isnull().sum()
Out[185]:
CAS 0
LogKOW 0
logBCF 0
dtype: int64
In [186]:
1
df.drop(['CAS'],inplace=True,axis=1)
In [187]:
1
df.dtypes
Out[187]:
LogKOW float64
logBCF float64
dtype: object
In [188]:
1
df['LogKOW']=((df['LogKOW']-df['LogKOW'].min())/(df['LogKOW'].max()-df['LogKOW'].min()))
2
df['logBCF']=((df['logBCF']-df['logBCF'].min())/(df['logBCF'].max()-df['logBCF'].min()))
3
df.head()
Out[188]:
LogKOW | logBCF | |
0 | 0.281879 | 0.428571 |
1 | 0.266667 | 0.411007 |
2 | 0.267562 | 0.243560 |
3 | 0.224609 | 0.173302 |
4 | 0.312752 | 0.388759 |
In [189]:
1
df.head()
Out[189]:
LogKOW | logBCF | |
0 | 0.281879 | 0.428571 |
1 | 0.266667 | 0.411007 |
2 | 0.267562 | 0.243560 |
3 | 0.224609 | 0.173302 |
4 | 0.312752 | 0.388759 |
In [190]:
1
sb.pairplot(df)
Out[190]:
<seaborn.axisgrid.PairGrid at 0x1e357659388>
In [191]:
1
sb.heatmap(df.corr(),annot=True)
Out[191]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e3571b10c8>
In [192]:
1
df.to_csv('ass2.csv')
In [193]:
1
x=df['LogKOW'].values[:,None]
2
y=df['logBCF'].values[:,None]
In [194]:
1
from sklearn.model_selection import train_test_split
2
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
In [195]:
1
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
2
(846, 1) (212, 1) (846, 1) (212, 1)
In [196]:
1
from sklearn.linear_model import LinearRegression
2
from scipy.stats import pearsonr
3
lm=LinearRegression()
4
m=lm.fit(x,y)
5
print(m.coef_,m.intercept_)
6
[[1.05578626]] [0.15798267]
In [197]:
1
y_test=lm.predict(x_test)
2
In [198]:
1
plt.scatter(x,y,color='b')
2
plt.plot(x_test,y_test,color='black',linewidth=3)
3
plt.xlabel('Height in inches')
4
plt.ylabel('Weigth in Pounds')
5
plt.show()
In [199]:
1
from sklearn.model_selection import train_test_split
2
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)
In [200]:
1
y_train_pred=lm.predict(x_train).ravel()
2
y_test_pred=lm.predict(x_test).ravel()
In [201]:
1
from sklearn.metrics import mean_squared_error as mse,r2_score
2
from sklearn.metrics import mean_absolute_error as mae
In [202]:
1
print("The Mean Squared Error on Train set is:\t{:0.1f}".format(mse(y_train,y_train_pred)))
2
print("The Mean Squared Error on Test set is:\t{:0.1f}".format(mse(y_test,y_test_pred)))
3
print("The Mean Average Error on Train set is:\t{:0.1f}".format(mae(y_train,y_train_pred)))
4
print("The Mean Average Error on Train set is:\t{:0.1f}".format(mae(y_test,y_test_pred)))
The Mean Squared Error on Train set is: 0.0
The Mean Squared Error on Test set is: 0.0
The Mean Average Error on Train set is: 0.1
The Mean Average Error on Train set is: 0.1
In [203]:
1
print("The R2 score on the Train set is:\t{:0.1f}".format(r2_score(y_train,y_train_pred)))
2
print("The R2 score on the Test set is:\t{:0.1f}".format(r2_score(y_test,y_test_pred)))
The R2 score on the Train set is: 0.4
The R2 score on the Test set is: 0.4
from sklearn.model_selection import cross_val_score accuracy=cross_val_score(lm,x,y,cv=10,scoring='r2') accuracy
array([0.1348827 , 0.20549326, 0.42319296, 0.38670252, 0.22786572, 0.41307769, 0.57789581, 0.51368184, 0.47863654, 0.3871694 ])