1 of 21

Application sous R

Nadia Bessoltane - INRAE

École de bioinformatique AVIESAN-IFB-INSERM 2022

2 of 21

Rappel : les objets sous R

# variable : stocker des valeurs numériques ou des chaînes de caractères

> string <- "hello world"

> value <- 10

# vecteur : stocker une liste de valeurs numériques ou de chaînes de caractères

> vect <- c(1,2,11,12)

# matrice : stocker un tableau 2D de valeurs numériques ou de chaînes de caractères

> mat <- matrix(c(1,2, 11,12), nrow = 2, ncol = 2, byrow = TRUE)

# data.frame : stocker des valeurs numériques et des chaînes de caractères

> df <- data.frame(c(1,11), c("x", "y"))

# liste : stocker des objets de nature différente

> list <- list(vector = vect, matrix = mat, dataframe = df)

> list

# objet S3/S4 (POO) : stocker des données structurées

2

1

2

11

12

1

2

11

12

1

x

11

y

3 of 21

3

https://www.tidyverse.org

4 of 21

tidy data

is a way to organize tabular data in a consistent data structure across packages.

4

> install.packages("tibble")

> library(tibble)

Tibbles are a table format provided by the tibble package. They inherit the data frame class, but have

improved behaviors

5 of 21

Data tidying

5

https://raw.githubusercontent.com/rstudio/cheatsheets/main/tidyr.pdf

> install.packages("tidyr")

> library(tidyr)

6 of 21

Data tidying with tidyr

unite / separate

6

name

month

year

1

dupont_lepetit

09

1945

2

jean_legrand

11

2000

3

toto_tutu

04

1820

unite(tb, month, year, col="birth", sep="/", remove=TRUE)

name

birth

1

dupont_lepetit

09/1945

2

jean_legrand

11/2000

3

toto_tutu

04/1820

7 of 21

Data tidying with tidyr

unite / separate

7

name

month

year

1

dupont_lepetit

09

1945

2

jean_legrand

11

2000

3

toto_tutu

04

1820

separate(tb, col = "name", into = c("Fname", "Lname"), sep="_", remove=TRUE)

Fname

Lname

month

year

1

dupont

lepetit

09

1945

2

jean

legrand

11

2000

3

toto

tutu

04

1820

8 of 21

Data tidying with tidyr

unite / separate

8

name

month

year

1

dupont_lepetit

09

1945

2

jean_legrand

11

2000

3

toto_tutu

04

1820

tb.u <- unite(tb, month, year, col="birth", sep="/")

separate(tb.u, col = "name", into = c("Fname", "Lname"), sep="_")

Fname

Lname

birth

1

dupont

lepetit

09/1945

2

jean

legrand

11/2000

3

toto

tutu

04/1820

unite(tb, month, year, col="birth", sep="/") %>%

separate(col = "name", into = c("Fname", "Lname"), sep="_")

=

9 of 21

Data tidying with tidyr

spread / gather

9

name

month

year

1

dupont_lepetit

09

1945

2

jean_legrand

11

2000

3

toto_tutu

04

1820

gather(tb, key, value, -name)

name

key

value

1

dupont_lepetit

month

09

2

dupont_lepetit

year

1945

3

jean_legrand

month

11

4

jean_legrand

year

2000

5

toto_tutu

month

04

6

toto_tutu

year

1820

spread(tb, key, value)

10 of 21

Data transformation

10

https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-transformation.pdf

> install.packages("dplyr")

> library(dplyr)

11 of 21

Data transformation with dplyr

filter : picks cases based on their values

11

X

Y

Z

W

1

0

no

2

4

yes

3

10

yes

4

1

yes

X

Y

Z

W

1

4

yes

2

10

yes

filter(tb, X > 2, Y == "yes")

12 of 21

Data transformation with dplyr

select : picks variables based on their names

12

X

Y

Z

W

1

2

3

4

select(tb, X,Z,W)

select(tb, -Y)

X

Z

W

1

2

3

4

13 of 21

Data transformation with dplyr

mutate : adds new variables that are functions of existing variables

13

X

Y

Z

1

a

b

2

c

d

3

e

f

4

g

h

mutate(tb, W = paste(X, Y, sep = "_"))

X

Y

Z

W

1

a

b

a_b

2

c

d

c_d

3

e

f

e_f

4

g

h

g_h

> paste("hello", "world", sep = " ")

[1] "hello world"

14 of 21

Data transformation with dplyr

mutate / if_else

14

X

Y

Z

1

NA

b

2

c

d

3

e

f

4

g

h

X

Y

Z

W

1

NA

b

b

2

c

d

c_d

3

e

f

e_f

4

g

h

g_h

> x <- c(NA, 0, NA, 3)

> dplyr::if_else(is.na(x), 0, x)

[1] 0 0 0 3

mutate(tb, W = if_else(is.na(X), Y,

paste(X, Y, sep = "_")))

15 of 21

Data transformation with dplyr

summarise / group_by

15

X

Y

Z

1

1

a

2

2

a

3

3

a

4

2

b

summarise(tb, avgX = mean(X))

avgX

1

2

Y

avgX

countY

1

a

2

3

2

b

2

1

tb.g <- group_by(tb, Y)

summarise(tb.g , avgX = mean(X), countY = n())

16 of 21

Data transformation with dplyr

full_join / left_join / right_join

16

X

Y

Z

1

1

a

2

2

a

3

3

a

4

2

b

Y

avgX

countY

1

a

2

3

2

b

2

1

+

X

Y

Z

avgX

countY

1

1

a

2

3

2

2

a

2

3

3

3

a

2

3

4

2

b

2

1

full_join(tb1 , tb2, by = "Y")

17 of 21

Data transformation

17

> install.packages("stringr")

> library(stringr)

https://github.com/rstudio/cheatsheets/blob/main/strings.pdf

https://stringr.tidyverse.org

18 of 21

Graphique

18

> install.packages("ggplot2")

> library(ggplot2)

https://thinkr.fr/pdf/ggplot2-french-cheatsheet.pdf

19 of 21

Les Graphiques avec ggplot2

geom_point

19

ggplot(data = tb) + geom_point(aes(x = taille, y = poids))

name

taille

poids

1

dupont_lepetit

180

80

2

jean_legrand

170

75

3

toto_tutu

160

55

20 of 21

Les Graphiques avec ggplot2

geom_line

20

ggplot(data = tb) + geom_line(aes(x = taille, y = poids))

name

taille

poids

1

dupont_lepetit

180

80

2

jean_legrand

170

75

3

toto_tutu

160

55

21 of 21

Les Graphiques avec ggplot2

geom_line

21

name

taille

poids

1

dupont_lepetit

180

80

2

jean_legrand

170

75

3

toto_tutu

160

55

ggplot(data = tb) +

geom_line(aes(x = taille, y = poids) , color="grey") + geom_point(aes(x = taille, y = poids, color=name))