5 Formule 1

Oui, c’est mon deuxième article sur le sport et sûrement pas le dernier. Mais promis, les prochains seront plus diversifiés.
J’adore ce sport. Je ne sais pas d’où m’est venue cette passion. Peut-être le fait d’avoir eu la chaîne CFI (RIP) comme seul moyen de regarder le sport gratuitement ? Ou le bruit des V10 ? Ou, plus certainement, le fait que ce soit une compétition très disputée ?…
Je n’ai jamais eu la réponse, mais j’aime ce sport bien qu’il faille payer pour le regarder (ha, je n’ai pas Canal en plus), que les plus gros pollueurs de cette terre veuillent faire bonne figure avec leurs V6 et qu’à chaque changement de règles une équipe, en toute décontraction, s’assure de gagner 3 voire 4 années de suite. N’empêche que le rendez-vous est déjà pris pour le 17 mars.

Pour l’instant, cet article ne repose que sur l’extraction des données. Une deuxième partie d’études plus poussées sera proposée prochainement.

Il y a 11 bases de données couvrant les saisons 1950 à 2017.

library(tidyverse)
library(lubridate)
library(ggmap)
library(ggthemes)
library(plotly)
library(kableExtra)
library(funModeling)
library(circlize)
library(colorspace)

5.1 Importation des données

# Load the circuits table, then print a compact overview of its columns.
circuits <- read_csv("Data/formula1/circuits.csv")
glimpse(circuits)
## Observations: 73
## Variables: 9
## $ circuitId  <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ circuitRef <chr> "albert_park", "sepang", "bahrain", "catalunya", "i...
## $ name       <chr> "Albert Park Grand Prix Circuit", "Sepang Internati...
## $ location   <chr> "Melbourne", "Kuala Lumpur", "Sakhir", "Montmel\xcc...
## $ country    <chr> "Australia", "Malaysia", "Bahrain", "Spain", "Turke...
## $ lat        <dbl> -37.84970, 2.76083, 26.03250, 41.57000, 40.95170, 4...
## $ lng        <dbl> 144.968000, 101.738000, 50.510600, 2.261110, 29.405...
## $ alt        <int> 10, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ url        <chr> "http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_...
# Load the constructors table, then print a compact overview of its columns.
constructeurs <- read_csv("Data/formula1/constructors.csv")
glimpse(constructeurs)
## Observations: 208
## Variables: 6
## $ constructorId  <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...
## $ constructorRef <chr> "mclaren", "bmw_sauber", "williams", "renault",...
## $ name           <chr> "McLaren", "BMW Sauber", "Williams", "Renault",...
## $ nationality    <chr> "British", "German", "British", "French", "Ital...
## $ url            <chr> "http://en.wikipedia.org/wiki/McLaren", "http:/...
## $ X6             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# Load the per-race constructor results, then print a column overview.
consResults <- read_csv("Data/formula1/constructorResults.csv")
glimpse(consResults)
## Observations: 11,142
## Variables: 5
## $ constructorResultsId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...
## $ raceId               <int> 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...
## $ constructorId        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 6, 2, ...
## $ points               <int> 14, 8, 9, 5, 2, 1, 0, 0, 0, 0, 0, 10, 11,...
## $ status               <chr> "NULL", "NULL", "NULL", "NULL", "NULL", "...
# Load the constructor standings, then print a column overview.
constructeurStandings <- read_csv("Data/formula1/constructorStandings.csv")
glimpse(constructeurStandings)
## Observations: 11,896
## Variables: 8
## $ constructorStandingsId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...
## $ raceId                 <int> 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,...
## $ constructorId          <int> 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, ...
## $ points                 <int> 14, 8, 9, 5, 2, 1, 24, 19, 9, 6, 2, 11,...
## $ position               <int> 1, 3, 2, 4, 5, 6, 1, 2, 4, 5, 8, 3, 6, ...
## $ positionText           <chr> "1", "3", "2", "4", "5", "6", "1", "2",...
## $ wins                   <int> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...
## $ X8                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# Load the drivers table, then print a compact overview of its columns.
drivers <- read_csv("Data/formula1/drivers.csv")
glimpse(drivers)
## Observations: 842
## Variables: 9
## $ driverId    <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ driverRef   <chr> "hamilton", "heidfeld", "rosberg", "alonso", "kova...
## $ number      <int> 44, NA, 6, 14, NA, NA, NA, 7, NA, NA, NA, NA, 19, ...
## $ code        <chr> "HAM", "HEI", "ROS", "ALO", "KOV", "NAK", "BOU", "...
## $ forename    <chr> "Lewis", "Nick", "Nico", "Fernando", "Heikki", "Ka...
## $ surname     <chr> "Hamilton", "Heidfeld", "Rosberg", "Alonso", "Kova...
## $ dob         <chr> "07/01/1985", "10/05/1977", "27/06/1985", "29/07/1...
## $ nationality <chr> "British", "German", "German", "Spanish", "Finnish...
## $ url         <chr> "http://en.wikipedia.org/wiki/Lewis_Hamilton", "ht...
# Load the driver standings, then print a compact overview of its columns.
driverStandings <- read_csv("Data/formula1/driverStandings.csv")
glimpse(driverStandings)
## Observations: 31,726
## Variables: 7
## $ driverStandingsId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ raceId            <int> 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, ...
## $ driverId          <int> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,...
## $ points            <int> 10, 8, 6, 5, 4, 3, 2, 1, 14, 11, 6, 6, 10, 3...
## $ position          <int> 1, 2, 3, 4, 5, 6, 7, 8, 1, 3, 6, 7, 4, 9, 10...
## $ positionText      <int> 1, 2, 3, 4, 5, 6, 7, 8, 1, 3, 6, 7, 4, 9, 10...
## $ wins              <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
# Load the lap-by-lap timing table, then print a column overview.
lapTimes <- read_csv("Data/formula1/lapTimes.csv")
glimpse(lapTimes)
## Observations: 426,633
## Variables: 6
## $ raceId       <int> 841, 841, 841, 841, 841, 841, 841, 841, 841, 841,...
## $ driverId     <int> 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 2...
## $ lap          <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
## $ position     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 2, 1...
## $ time         <time> 01:38:00, 01:33:00, 01:32:00, 01:32:00, 01:32:00...
## $ milliseconds <int> 98109, 93006, 92713, 92803, 92342, 92605, 92502, ...
# Load the races table, then print a compact overview of its columns.
races <- read_csv("Data/formula1/races.csv")
glimpse(races)
## Observations: 997
## Variables: 8
## $ raceId    <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ year      <int> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009...
## $ round     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ circuitId <int> 1, 2, 17, 3, 4, 6, 5, 9, 20, 11, 12, 13, 14, 15, 22,...
## $ name      <chr> "Australian Grand Prix", "Malaysian Grand Prix", "Ch...
## $ date      <date> 2009-03-29, 2009-04-05, 2009-04-19, 2009-04-26, 200...
## $ time      <time> 06:00:00, 09:00:00, 07:00:00, 12:00:00, 12:00:00, 1...
## $ url       <chr> "http://en.wikipedia.org/wiki/2009_Australian_Grand_...
# Load the race results table, then print a compact overview of its columns.
results <- read_csv("Data/formula1/results.csv")
glimpse(results)
## Observations: 23,777
## Variables: 18
## $ resultId        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ raceId          <int> 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18...
## $ driverId        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ constructorId   <int> 1, 2, 3, 4, 1, 3, 5, 6, 2, 7, 8, 4, 6, 9, 7, 1...
## $ number          <int> 22, 3, 7, 5, 23, 8, 14, 1, 4, 12, 18, 6, 2, 9,...
## $ grid            <int> 1, 5, 7, 11, 3, 13, 17, 15, 2, 18, 19, 20, 4, ...
## $ position        <int> 1, 2, 3, 4, 5, 6, 7, 8, NA, NA, NA, NA, NA, NA...
## $ positionText    <chr> "1", "2", "3", "4", "5", "6", "7", "8", "R", "...
## $ positionOrder   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ points          <int> 10, 8, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ laps            <int> 58, 58, 58, 58, 58, 57, 55, 53, 47, 43, 32, 30...
## $ time            <chr> "34:50.6", "5.478", "8.163", "17.181", "18.014...
## $ milliseconds    <int> 5690616, 5696094, 5698779, 5707797, 5708630, N...
## $ fastestLap      <int> 39, 41, 41, 58, 43, 50, 22, 20, 15, 23, 24, 20...
## $ rank            <int> 2, 3, 5, 7, 1, 14, 12, 4, 9, 13, 15, 16, 6, 11...
## $ fastestLapTime  <time> 01:27:00, 01:27:00, 01:28:00, 01:28:00, 01:27...
## $ fastestLapSpeed <dbl> 218.300, 217.586, 216.719, 215.464, 218.385, 2...
## $ statusId        <int> 1, 1, 1, 1, 1, 11, 5, 5, 4, 3, 7, 8, 5, 4, 10,...

5.2 Nettoyage des données

# ---- circuits ----
# Turn the snake_case reference into a title-cased label
# ("albert_park" -> "Albert Park") and store it as a factor.
circuits$circuitRef <- as.factor(
  str_to_title(gsub("_", " ", circuits$circuitRef))
)
# The remaining categorical columns become factors as well.
circuits[c("location", "country")] <- lapply(
  circuits[c("location", "country")],
  as.factor
)

# ---- constructors ----
constructeurs[["name"]] <- as.factor(constructeurs[["name"]])

# ---- drivers ----
# Collapse compound / historical nationalities into a single label before
# converting to a factor.
# NOTE(review): "Dutch" is mapped to "German" here, which merges two distinct
# nationalities -- possibly a mix-up with "Deutsch". Kept as-is because the
# figures further down depend on it; confirm whether this is intended.
drivers$nationality <- drivers$nationality %>%
  str_replace_all(c(
    "American-Italian" = "American",
    "Argentine-Italian" = "Argentine",
    "East German" = "German",
    "Dutch" = "German"
  )) %>%
  as.factor()

# Overwrite the date of birth (day/month/year strings) of a handful of
# drivers, identified by driverRef. local() keeps the helper objects out of
# the global environment.
drivers$dob <- local({
  corrected_dob <- c(
    chiron    = "03/08/1899",
    fagioli   = "02/07/1898",
    dusio     = "13/10/1899",
    reed      = "13/10/1899",
    etancelin = "28/12/1896",
    brudes    = "15/10/1899",
    legat     = "01/11/1898",
    biondetti = "18/08/1898"
  )
  hit <- match(drivers$driverRef, names(corrected_dob))
  found <- !is.na(hit)
  dob <- drivers$dob
  dob[found] <- unname(corrected_dob[hit[found]])
  dob
})
# dmy() parses the day/month/year strings and already returns a Date.
drivers$dob <- dmy(drivers$dob)

# ---- results ----
# Missing fastest-lap speed / time values are replaced by 0.
results[is.na(results$fastestLapSpeed), "fastestLapSpeed"] <- 0
results[is.na(results$fastestLapTime), "fastestLapTime"] <- 0

5.3 Jointure des bases de données

# Chain the five tables together with full joins (circuits -> races ->
# results -> drivers -> constructors) and keep only the analysis columns,
# renaming a few of them on the way.
# `nationality.x` is the column coming from `drivers`: both `drivers` and
# `constructeurs` carry a `nationality` column, so the join suffixes them.
# NOTE(review): `name` is presumably the constructor name at this point (the
# circuit and race names having been suffixed by the first join) -- confirm.
formule1 <- circuits %>%
  full_join(races, by = "circuitId") %>%
  full_join(results, by = "raceId") %>%
  full_join(drivers, by = "driverId") %>%
  full_join(constructeurs, by = "constructorId") %>%
  select(
    circuitRef, location, country, lat, lng,
    date, year,
    pilot = driverRef, birthday = dob, nationality = nationality.x,
    constructor = name,
    grid, positionOrder, positionText, points, laps,
    fastestLapTime, fastestLapSpeed
  )

# Keep only the rows that have a driver, and store the title-cased driver
# reference as a factor.
f1 <- formule1 %>%
  filter(!is.na(pilot)) %>%
  mutate(pilot = as.factor(str_to_title(pilot)))



# Data-quality summary of `f1`: for every column, the number of zeros, the
# number of NAs, the column type and the number of distinct values, rendered
# as a full-width table.
# TRUE/FALSE are spelled out (T and F are ordinary, reassignable variables)
# and `booktabs` is written in full rather than relying on partial argument
# matching of `booktab`.
df_status(f1, print_results = FALSE) %>%
  select(variable, q_zeros, q_na, type, unique) %>%
  kable(booktabs = TRUE) %>%
  kable_styling(full_width = TRUE)
variable q_zeros q_na type unique
circuitRef 0 0 factor 72
location 0 0 factor 69
country 0 0 factor 32
lat 0 0 numeric 71
lng 0 0 numeric 71
date 0 0 Date 976
year 0 0 integer 68
pilot 0 0 factor 842
birthday 0 0 Date 823
nationality 0 0 factor 37
constructor 0 0 factor 207
grid 1566 0 integer 35
positionOrder 0 0 integer 39
positionText 0 0 character 39
points 17097 39 integer 21
laps 2456 0 integer 172
fastestLapTime 18394 0 hms-difftime 79
fastestLapSpeed 18395 0 numeric 5144

5.4 Application Shiny

Vous pouvez accéder à mon application Shiny par CE LIEN pour voir la suite de l’étude.