1 of 21

Application sous R

Nadia Bessoltane - INRAE

École de bioinformatique AVIESAN-IFB-INSERM 2022

2 of 21

Rappel : les objets sous R

# variable : stocker des valeurs numériques ou des chaînes de caractères

> string <- "hello world"

> value <- 10

# vecteur : stocker une liste de valeurs numériques ou de chaînes de caractères

> vect <- c(1,2,11,12)

# matrice : stocker un tableau 2D de valeurs numériques ou de chaînes de caractères

> mat <- matrix(c(1,2, 11,12), nrow = 2, ncol = 2, byrow = TRUE)

# data.frame : stocker des valeurs numériques et des chaînes de caractères

> df <- data.frame(c(1,11), c("x", "y"))

# liste : stocker des objets de nature différente

> list <- list(vector = vect, matrix = mat, dataframe = df)

> list

# objet S3/S4 (POO) : stocker des données structurées

2

1	2	11	12

1	2
11	12

1	x
11	y

3 of 21

3

https://www.tidyverse.org

4 of 21

tidy data

is a way to organize tabular data in a consistent data structure across packages.

4

> install.packages("tibble")

> library(tibble)

Tibbles are a table format provided by the tibble package. They inherit the data frame class, but have

improved behaviors

5 of 21

Data tidying

5

https://raw.githubusercontent.com/rstudio/cheatsheets/main/tidyr.pdf

> install.packages("tidyr")

> library(tidyr)

6 of 21

Data tidying with tidyr

unite / separate

6

	name	month	year
1	dupont_lepetit	09	1945
2	jean_legrand	11	2000
3	toto_tutu	04	1820

unite(tb, month, year, col="birth", sep="/", remove=TRUE)

	name	birth
1	dupont_lepetit	09/1945
2	jean_legrand	11/2000
3	toto_tutu	04/1820

7 of 21

Data tidying with tidyr

unite / separate

7

	name	month	year
1	dupont_lepetit	09	1945
2	jean_legrand	11	2000
3	toto_tutu	04	1820

separate(tb, col = "name", into = c("Fname", "Lname"), sep="_", remove=TRUE)

	Fname	Lname	month	year
1	dupont	lepetit	09	1945
2	jean	legrand	11	2000
3	toto	tutu	04	1820

8 of 21

Data tidying with tidyr

unite / separate

8

	name	month	year
1	dupont_lepetit	09	1945
2	jean_legrand	11	2000
3	toto_tutu	04	1820

tb.u <- unite(tb, month, year, col="birth", sep="/")

separate(tb.u, col = "name", into = c("Fname", "Lname"), sep="_")

	Fname	Lname	birth
1	dupont	lepetit	09/1945
2	jean	legrand	11/2000
3	toto	tutu	04/1820

unite(tb, month, year, col="birth", sep="/") %>%

separate(col = "name", into = c("Fname", "Lname"), sep="_")

=

9 of 21

Data tidying with tidyr

spread / gather

9

	name	month	year
1	dupont_lepetit	09	1945
2	jean_legrand	11	2000
3	toto_tutu	04	1820

gather(tb, key, value, -name)

	name	key	value
1	dupont_lepetit	month	09
2	dupont_lepetit	year	1945
3	jean_legrand	month	11
4	jean_legrand	year	2000
5	toto_tutu	month	04
6	toto_tutu	year	1820

spread(tb, key, value)

10 of 21

Data transformation

10

https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-transformation.pdf

> install.packages("dplyr")

> library(dplyr)

11 of 21

Data transformation with dplyr

filter : picks cases based on their values

11

	X	Y	Z	W
1	0	no
2	4	yes
3	10	yes
4	1	yes

	X	Y	Z	W
1	4	yes
2	10	yes

filter(tb, X > 2, Y == "yes")

12 of 21

Data transformation with dplyr

select : picks variables based on their names

12

	X	Y	Z	W
1
2
3
4

select(tb, X,Z,W)

select(tb, -Y)

	X	Z	W
1
2
3
4

13 of 21

Data transformation with dplyr

mutate : adds new variables that are functions of existing variables

13

	X	Y	Z
1	a	b
2	c	d
3	e	f
4	g	h

mutate(tb, W = paste(X, Y, sep = "_"))

	X	Y	Z	W
1	a	b		a_b
2	c	d		c_d
3	e	f		e_f
4	g	h		g_h

> paste("hello", "world", sep = " ")

[1] "hello world"

14 of 21

Data transformation with dplyr

mutate / if_else

14

	X	Y	Z
1	NA	b
2	c	d
3	e	f
4	g	h

	X	Y	Z	W
1	NA	b		b
2	c	d		c_d
3	e	f		e_f
4	g	h		g_h

> x <- c(NA, 0, NA, 3)

> dplyr::if_else(is.na(x), 0, x)

[1] 0 0 0 3

mutate(tb, W = if_else(is.na(X), Y,

paste(X, Y, sep = "_")))

15 of 21

Data transformation with dplyr

summarise / group_by

15

	X	Y	Z
1	1	a
2	2	a
3	3	a
4	2	b

summarise(tb, avgX = mean(X))

	avgX
1	2

	Y	avgX	countY
1	a	2	3
2	b	2	1

tb.g <- group_by(tb, Y)

summarise(tb.g , avgX = mean(X), countY = n())

16 of 21

Data transformation with dplyr

full_join / left_join / right_join

16

	X	Y	Z
1	1	a
2	2	a
3	3	a
4	2	b

	Y	avgX	countY
1	a	2	3
2	b	2	1

+

	X	Y	Z	avgX	countY
1	1	a		2	3
2	2	a		2	3
3	3	a		2	3
4	2	b		2	1

full_join(tb1 , tb2, by = "Y")

17 of 21

Data transformation

17

> install.packages("stringr")

> library(stringr)

https://github.com/rstudio/cheatsheets/blob/main/strings.pdf

https://stringr.tidyverse.org

18 of 21

Graphique

18

> install.packages("ggplot2")

> library(ggplot2)

https://thinkr.fr/pdf/ggplot2-french-cheatsheet.pdf

19 of 21

Les Graphiques avec ggplot2

geom_point

19

ggplot(data = tb) + geom_point(aes(x = taille, y = poids))

	name	taille	poids
1	dupont_lepetit	180	80
2	jean_legrand	170	75
3	toto_tutu	160	55

20 of 21

Les Graphiques avec ggplot2

geom_line

20

ggplot(data = tb) + geom_line(aes(x = taille, y = poids))

	name	taille	poids
1	dupont_lepetit	180	80
2	jean_legrand	170	75
3	toto_tutu	160	55

21 of 21

Les Graphiques avec ggplot2

geom_line

21

	name	taille	poids
1	dupont_lepetit	180	80
2	jean_legrand	170	75
3	toto_tutu	160	55

ggplot(data = tb) +

geom_line(aes(x = taille, y = poids) , color="grey") + geom_point(aes(x = taille, y = poids, color=name))