In [175]:

1

import numpy as np

2

import pandas as pd

3

import seaborn as sb

4

import matplotlib.pyplot as plt

5

import statsmodels.api as sm

6

from sklearn.model_selection import train_test_split

In [176]:

1

df=pd.read_csv('G:/qsar.csv')

In [177]:

1

cl=['SMILES','KOW type','Name']

2

df.drop(cl,inplace=True,axis=1)

In [178]:

1

df.tail()

2

df.to_csv('ass1.csv')

In [179]:

1

df=pd.read_csv('ass1.csv')

In [180]:

1

df.drop(columns=['Unnamed: 0'],inplace=True)

In [181]:

1

df.isnull().sum()

2

df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 1058 entries, 0 to 1057

Data columns (total 3 columns):

 #   Column  Non-Null Count  Dtype  

---  ------  --------------  -----  

 0   CAS     1058 non-null   object

 1   LogKOW  1056 non-null   object

 2   logBCF  1056 non-null   float64

dtypes: float64(1), object(2)

memory usage: 24.9+ KB

In [182]:

1

dfn = df.convert_dtypes()

2

dfn.dtypes

Out[182]:

CAS        string

LogKOW     string

logBCF    float64

dtype: object

In [183]:

1

df.at[1050:1058,'LogKOW']=np.NaN

2

df['LogKOW']=pd.to_numeric(df['LogKOW'],errors='coerce')

3

df.describe()

Out[183]:

LogKOW

logBCF

count

1046.000000

1056.000000

mean

3.839512

2.031780

std

2.242551

1.362284

min

-4.000000

-2.480000

25%

2.375000

0.910000

50%

3.790000

2.000000

75%

5.230000

3.020000

max

18.350000

6.060000

In [184]:

1

df['LogKOW'].fillna(df['LogKOW'].mean(),inplace=True)

2

df['logBCF'].fillna(df['logBCF'].mean(),inplace=True)

In [185]:

1

df.isnull().sum()

Out[185]:

CAS       0

LogKOW    0

logBCF    0

dtype: int64

In [186]:

1

df.drop(['CAS'],inplace=True,axis=1)

In [187]:

1

df.dtypes

Out[187]:

LogKOW    float64

logBCF    float64

dtype: object

In [188]:

1

df['LogKOW']=((df['LogKOW']-df['LogKOW'].min())/(df['LogKOW'].max()-df['LogKOW'].min()))

2

df['logBCF']=((df['logBCF']-df['logBCF'].min())/(df['logBCF'].max()-df['logBCF'].min()))

3

df.head()

Out[188]:

LogKOW

logBCF

0

0.281879

0.428571

1

0.266667

0.411007

2

0.267562

0.243560

3

0.224609

0.173302

4

0.312752

0.388759

In [189]:

1

df.head()

Out[189]:

LogKOW

logBCF

0

0.281879

0.428571

1

0.266667

0.411007

2

0.267562

0.243560

3

0.224609

0.173302

4

0.312752

0.388759

In [190]:

1

sb.pairplot(df)

Out[190]:

<seaborn.axisgrid.PairGrid at 0x1e357659388>

In [191]:

1

sb.heatmap(df.corr(),annot=True)

Out[191]:

<matplotlib.axes._subplots.AxesSubplot at 0x1e3571b10c8>

In [192]:

1

df.to_csv('ass2.csv')

In [193]:

1

x=df['LogKOW'].values[:,None]

2

y=df['logBCF'].values[:,None]

In [194]:

Training DATAset:

1

from sklearn.model_selection import train_test_split

2

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [195]:

1

print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

2

(846, 1) (212, 1) (846, 1) (212, 1)

In [196]:

1

from sklearn.linear_model import LinearRegression

2

from scipy.stats import pearsonr

3

lm=LinearRegression()

4

m=lm.fit(x,y)

5

print(m.coef_,m.intercept_)

6

[[1.05578626]] [0.15798267]

In [197]:

1

y_test=lm.predict(x_test)

2

In [198]:

1

plt.scatter(x,y,color='b')

2

plt.plot(x_test,y_test,color='black',linewidth=3)

3

plt.xlabel('Height in inches')

4

plt.ylabel('Weigth in Pounds')

5

plt.show()

MOdel PErfromance

In [199]:

1

from sklearn.model_selection import train_test_split

2

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [200]:

1

y_train_pred=lm.predict(x_train).ravel()

2

y_test_pred=lm.predict(x_test).ravel()

In [201]:

1

from sklearn.metrics import mean_squared_error as mse,r2_score

2

from sklearn.metrics import mean_absolute_error as mae

In [202]:

1

print("The Mean Squared Error on Train set is:\t{:0.1f}".format(mse(y_train,y_train_pred)))

2

print("The Mean Squared Error on Test set is:\t{:0.1f}".format(mse(y_test,y_test_pred)))

3

print("The Mean Average  Error on Train set is:\t{:0.1f}".format(mae(y_train,y_train_pred)))

4

print("The Mean Average  Error on Train set is:\t{:0.1f}".format(mae(y_test,y_test_pred)))

The Mean Squared Error on Train set is:        0.0

The Mean Squared Error on Test set is:        0.0

The Mean Average  Error on Train set is:        0.1

The Mean Average  Error on Train set is:        0.1

In [203]:

1

print("The R2 score on the Train set is:\t{:0.1f}".format(r2_score(y_train,y_train_pred)))

2

print("The R2 score on the Test set is:\t{:0.1f}".format(r2_score(y_test,y_test_pred)))

The R2 score on the Train set is:        0.4

The R2 score on the Test set is:        0.4

ACCURACY TEST:

from sklearn.model_selection import cross_val_score accuracy=cross_val_score(lm,x,y,cv=10,scoring='r2') accuracy

array([0.1348827 , 0.20549326, 0.42319296, 0.38670252, 0.22786572, 0.41307769, 0.57789581, 0.51368184, 0.47863654, 0.3871694 ])