Page 1 of 7

# Survival Analysis in R

# Copyright 2013 by Ani Katchova

# install.packages("survival")

library(survival)

mydata<- read.csv("C:/Econometrics/Data/survival_unemployment.csv")

attach(mydata)

# Define variables

time <- spell

event <- event

X <- cbind(logwage, ui, age)

group <- ui

# Descriptive statistics

summary(time)

summary(event)

summary(X)

summary(group)

# Kaplan-Meier non-parametric analysis

kmsurvival <- survfit(Surv(time,event) ~ 1)

summary(kmsurvival)

plot(kmsurvival, xlab="Time", ylab="Survival Probability")

# Kaplan-Meier non-parametric analysis by group

kmsurvival1 <- survfit(Surv(time, event) ~ group)

summary(kmsurvival1)

plot(kmsurvival1, xlab="Time", ylab="Survival Probability")

# Nelson-Aalen non-parametric analysis

nasurvival <- survfit(coxph(Surv(time,event)~1), type="aalen")

summary(nasurvival)

plot(nasurvival, xlab="Time", ylab="Survival Probability")

# Cox proportional hazard model - coefficients and hazard rates

coxph <- coxph(Surv(time,event) ~ X, method="breslow")

summary(coxph)

# Exponential, Weibull, and log-logistic parametric model coefficients

# Opposite signs from Stata results, Weibull results differ; same as SAS

exponential <- survreg(Surv(time,event) ~ X, dist="exponential")

summary(exponential)

weibull <- survreg(Surv(time,event) ~ X, dist="weibull")

summary(weibull)

loglogistic <- survreg(Surv(time,event) ~ X, dist="loglogistic")

summary(loglogistic)

Page 2 of 7

> # Survival Analysis in R

> # Copyright 2013 by Ani Katchova

>

> # install.packages("survival")

> library(survival)

Loading required package: splines

>

> mydata<- read.csv("C:/Econometrics/Data/survival_unemployment.csv")

> attach(mydata)

>

> # Define variables

> time <- spell

> event <- event

> X <- cbind(logwage, ui, age)

> group <- ui

>

> # Descriptive statistics

> summary(time)

Min. 1st Qu. Median Mean 3rd Qu. Max.

1.000 2.000 5.000 6.248 9.000 28.000

> summary(event)

Min. 1st Qu. Median Mean 3rd Qu. Max.

0.000 0.000 0.000 0.321 1.000 1.000

> summary(X)

logwage ui age

Min. :2.708 Min. :0.0000 Min. :20.00

1st Qu.:5.298 1st Qu.:0.0000 1st Qu.:27.00

Median :5.677 Median :1.0000 Median :34.00

Mean :5.693 Mean :0.5528 Mean :35.44

3rd Qu.:6.052 3rd Qu.:1.0000 3rd Qu.:43.00

Max. :7.600 Max. :1.0000 Max. :61.00

> summary(group)

Min. 1st Qu. Median Mean 3rd Qu. Max.

0.0000 0.0000 1.0000 0.5528 1.0000 1.0000

>

> # Kaplan-Meier non-parametric analysis

> kmsurvival <- survfit(Surv(time,event) ~ 1)

> summary(kmsurvival)

Call: survfit(formula = Surv(time, event) ~ 1)

time n.risk n.event survival std.err lower 95% CI upper 95% CI

1 3343 294 0.912 0.00490 0.903 0.922

2 2803 178 0.854 0.00622 0.842 0.866

3 2321 119 0.810 0.00708 0.797 0.824

4 1897 56 0.786 0.00756 0.772 0.801

5 1676 104 0.738 0.00847 0.721 0.754

6 1339 32 0.720 0.00882 0.703 0.737

7 1196 85 0.669 0.00979 0.650 0.688

8 933 15 0.658 0.01001 0.639 0.678

9 848 33 0.632 0.01057 0.612 0.654

10 717 3 0.630 0.01064 0.609 0.651

11 659 26 0.605 0.01128 0.583 0.627

12 556 7 0.597 0.01150 0.575 0.620

13 509 25 0.568 0.01234 0.544 0.593

14 415 30 0.527 0.01353 0.501 0.554

Page 3 of 7

15 311 19 0.495 0.01458 0.467 0.524

16 252 10 0.475 0.01527 0.446 0.506

17 201 8 0.456 0.01606 0.426 0.489

18 169 7 0.437 0.01691 0.405 0.472

19 149 4 0.426 0.01744 0.393 0.461

20 130 3 0.416 0.01794 0.382 0.452

21 109 4 0.400 0.01883 0.365 0.439

22 82 4 0.381 0.02029 0.343 0.423

26 48 2 0.365 0.02233 0.324 0.412

27 33 5 0.310 0.02964 0.257 0.374

> plot(kmsurvival, xlab="Time", ylab="Survival Probability")

>

> # Kaplan-Meier non-parametric analysis by group

> kmsurvival1 <- survfit(Surv(time, event) ~ group)

> summary(kmsurvival1)

Call: survfit(formula = Surv(time, event) ~ group)

group=0

time n.risk n.event survival std.err lower 95% CI upper 95% CI

1 1495 266 0.822 0.00989 0.803 0.842

2 1038 116 0.730 0.01191 0.707 0.754

3 717 55 0.674 0.01317 0.649 0.701

4 501 20 0.647 0.01396 0.620 0.675

5 423 36 0.592 0.01550 0.563 0.623

6 305 8 0.577 0.01603 0.546 0.609

7 265 28 0.516 0.01801 0.482 0.552

8 191 4 0.505 0.01842 0.470 0.542

9 176 5 0.491 0.01898 0.455 0.529

10 151 1 0.487 0.01913 0.451 0.526

11 141 6 0.467 0.02010 0.429 0.508

12 116 1 0.463 0.02033 0.424 0.504

13 111 5 0.442 0.02144 0.402 0.486

14 91 9 0.398 0.02376 0.354 0.447

15 73 3 0.382 0.02459 0.336 0.433

16 61 3 0.363 0.02566 0.316 0.417

17 45 4 0.331 0.02799 0.280 0.390

18 39 3 0.305 0.02944 0.253 0.369

19 35 1 0.297 0.02986 0.243 0.361

26 15 1 0.277 0.03379 0.218 0.352

27 11 1 0.252 0.03897 0.186 0.341

group=1

time n.risk n.event survival std.err lower 95% CI upper 95% CI

1 1848 28 0.985 0.00284 0.979 0.990

2 1765 62 0.950 0.00511 0.940 0.960

3 1604 64 0.912 0.00676 0.899 0.926

4 1396 36 0.889 0.00764 0.874 0.904

5 1253 68 0.841 0.00919 0.823 0.859

6 1034 24 0.821 0.00980 0.802 0.841

7 931 57 0.771 0.01124 0.749 0.793

8 742 11 0.759 0.01159 0.737 0.782

9 672 28 0.728 0.01255 0.704 0.753

10 566 2 0.725 0.01264 0.701 0.750

11 518 20 0.697 0.01362 0.671 0.724

12 440 6 0.688 0.01397 0.661 0.716

Page 4 of 7

13 398 20 0.653 0.01526 0.624 0.684

14 324 21 0.611 0.01683 0.579 0.645

15 238 16 0.570 0.01857 0.534 0.607

16 191 7 0.549 0.01949 0.512 0.588

17 156 4 0.535 0.02022 0.497 0.576

18 130 4 0.518 0.02121 0.478 0.562

19 114 3 0.505 0.02207 0.463 0.550

20 99 3 0.489 0.02310 0.446 0.537

21 81 4 0.465 0.02492 0.419 0.517

22 61 4 0.435 0.02756 0.384 0.492

26 33 1 0.422 0.02970 0.367 0.484

27 22 4 0.345 0.04233 0.271 0.439

> plot(kmsurvival1, xlab="Time", ylab="Survival Probability")

>

> # Nelson-Aalen non-parametric analysis

> nasurvival <- survfit(coxph(Surv(time,event)~1), type="aalen")

> summary(nasurvival)

Call: survfit(formula = coxph(Surv(time, event) ~ 1), type = "aalen")

time n.risk n.event survival std.err lower 95% CI upper 95% CI

1 3343 294 0.916 0.00470 0.907 0.925

2 2803 178 0.859 0.00601 0.848 0.871

3 2321 119 0.817 0.00688 0.803 0.830

4 1897 56 0.793 0.00738 0.778 0.807

5 1676 104 0.745 0.00828 0.729 0.761

6 1339 32 0.727 0.00865 0.711 0.745

7 1196 85 0.678 0.00960 0.659 0.697

8 933 15 0.667 0.00985 0.648 0.686

9 848 33 0.641 0.01042 0.621 0.662

10 717 3 0.639 0.01049 0.618 0.660

11 659 26 0.614 0.01115 0.592 0.636

12 556 7 0.606 0.01138 0.584 0.629

13 509 25 0.577 0.01223 0.554 0.602

14 415 30 0.537 0.01340 0.511 0.564

15 311 19 0.505 0.01446 0.478 0.534

16 252 10 0.485 0.01517 0.457 0.516

17 201 8 0.467 0.01599 0.436 0.499

18 169 7 0.448 0.01687 0.416 0.482

19 149 4 0.436 0.01743 0.403 0.471

20 130 3 0.426 0.01795 0.392 0.462

21 109 4 0.410 0.01887 0.375 0.449

22 82 4 0.391 0.02035 0.353 0.433

26 48 2 0.375 0.02243 0.333 0.422

27 33 5 0.322 0.02912 0.270 0.385

> plot(nasurvival, xlab="Time", ylab="Survival Probability")

>

>

> # Cox proportional hazard model - coefficients and hazard rates

> coxph <- coxph(Surv(time,event) ~ X, method="breslow")

> summary(coxph)

Call:

coxph(formula = Surv(time, event) ~ X, method = "breslow")

n= 3343, number of events= 1073

Page 5 of 7

coef exp(coef) se(coef) z Pr(>|z|)

Xlogwage 0.461553 1.586535 0.057190 8.070 6.66e-16 ***

Xui -0.979578 0.375470 0.063979 -15.311 < 2e-16 ***

Xage -0.010850 0.989209 0.003132 -3.465 0.000531 ***

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

exp(coef) exp(-coef) lower .95 upper .95

Xlogwage 1.5865 0.6303 1.4183 1.7747

Xui 0.3755 2.6633 0.3312 0.4256

Xage 0.9892 1.0109 0.9832 0.9953

Concordance= 0.693 (se = 0.011 )

Rsquare= 0.081 (max possible= 0.992 )

Likelihood ratio test= 281.5 on 3 df, p=0

Wald test = 286.3 on 3 df, p=0

Score (logrank) test = 300 on 3 df, p=0

>

>

> # Exponential, Weibull, and log-logistic parametric model coefficients

> # Opposite signs from Stata results, Weibull results differ; same as SAS

> exponential <- survreg(Surv(time,event) ~ X, dist="exponential")

> summary(exponential)

Call:

survreg(formula = Surv(time, event) ~ X, dist = "exponential")

Value Std. Error z p

(Intercept) 4.6426 0.30841 15.05 3.27e-51

Xlogwage -0.4810 0.05678 -8.47 2.43e-17

Xui 1.0775 0.06269 17.19 3.29e-66

Xage 0.0126 0.00312 4.05 5.18e-05

Scale fixed at 1

Exponential distribution

Loglik(model)= -4083.8 Loglik(intercept only)= -4258.4

Chisq= 349.26 on 3 degrees of freedom, p= 0

Number of Newton-Raphson Iterations: 5

n= 3343

>

> weibull <- survreg(Surv(time,event) ~ X, dist="weibull")

> summary(weibull)

Call:

survreg(formula = Surv(time, event) ~ X, dist = "weibull")

Value Std. Error z p

(Intercept) 4.4784 0.29145 15.37 2.78e-53

Xlogwage -0.4567 0.05343 -8.55 1.26e-17

Xui 1.0352 0.06014 17.21 2.15e-66

Xage 0.0125 0.00292 4.28 1.90e-05

Log(scale) -0.0695 0.02328 -2.99 2.81e-03

Page 6 of 7

Scale= 0.933

Weibull distribution

Loglik(model)= -4079.5 Loglik(intercept only)= -4258.2

Chisq= 357.57 on 3 degrees of freedom, p= 0

Number of Newton-Raphson Iterations: 5

n= 3343

>

> loglogistic <- survreg(Surv(time,event) ~ X, dist="loglogistic")

> summary(loglogistic)

Call:

survreg(formula = Surv(time, event) ~ X, dist = "loglogistic")

Value Std. Error z p

(Intercept) 4.0408 0.31258 12.93 3.16e-38

Xlogwage -0.4622 0.05653 -8.18 2.94e-16

Xui 1.2099 0.05939 20.37 3.07e-92

Xage 0.0106 0.00291 3.64 2.77e-04

Log(scale) -0.3063 0.02437 -12.57 3.20e-36

Scale= 0.736

Log logistic distribution

Loglik(model)= -4014.1 Loglik(intercept only)= -4232

Chisq= 435.7 on 3 degrees of freedom, p= 0

Number of Newton-Raphson Iterations: 4

n= 3343

Page 7 of 7