Quite a while ago my amazing wife asked me if it was possible to find the time of the first goal for the 2006 FIFA World Cup matches.  I was using R at the time and thought it was possible.  Here are the scripts I wrote to scrape the info from the FIFA website.  They're also posted on my github here.

There are three scripts, One scrapes the data and saves it as CSV files, the next does some processing and saves the results as csv's and the third produces some basic graphs.

Web Scraping Script

# Script to Scrape web pages to collect World 
# Script to Scrape web pages to collect World Cup 2006 score data
#
# Author: Paul Hurley
###############################################################################
require(ggplot2)
require(plyr)
require(stringr)
require(RCurl)
require(XML)
#' Function to Sort a dataframe with a given list of columns
#' Cribbed from Spector, P. (2008). "Data Manipulation with R", UseR! Springer. Pg78
#' @param df Dataframe to be sorted
#' @param ... list of columns to sort on
#' @returnType
#' @return A sorted dataframe
#' @author "Paul Hurley"
#' @
export
#' @
#' @usage with(dataframe,sortframe(dataframe,column1, column2, column3))
#' @examples with(iris,sortframe(iris,Sepal.Length,Sepal.Width,Petal.Length))
sortframe<-function(df,...){df[do.call(order,list(...)),]}
goals<-function(match) {
theURL <-paste("http://www.fifa.com/worldcup/archive/germany2006/results/matches/match=974100",match,"/report.html",sep="")
webpage = tryCatch(getURL(theURL, header=FALSE, verbose=TRUE),
HTTPError = function(e) {
cat("HTTP error: ", e$message, "\n")
})
message(paste("Webpage size is ",nchar(webpage),sep=""))
webpagecont <- readlines="" tc="" -="" textconnection="" webpage="" close="" fifa="" doc="" -htmlparse="" webpagecont="" xpathsapply="" div="" class="cont" xmlvalue="" goals="" scored="" grep="" value="TRUE)" -gsub="" strsplit="" 1="" -strsplit="" table="" -as="" data="" frame="" matrix="" unlist="" ncol="3,byrow=TRUE))" names="" -c="" player="NA,Team=NA,Time=NA)" team="" time="" message="" paste="" there="" were="" nrow="" sep="" if="" 0="" -data="" now="" get="" the="" match="" details="" contains="" teams="" and="" final="" score="" 2="" number="" date="" venue="" attendance="" 0-9="" a-za-z="" fullmatch="" -fifa="" tempfifa="" matchdatetimevenue="" stadiumattendance="" -substr="" nchar="" -2="" tempfifa2="" -unlist="" -paste="" substr="" 2006="" 1000="" :="" 0-5="" return="" groupa="" 01="" 02="" 17="" 18="" 33="" 34="" groupb="" 03="" 04="" 19="" 20="" 35="" 36="" groupc="" 05="" 06="" 21="" 22="" 37="" 38="" groupd="" 07="" 08="" 23="" 25="" 39="" 40="" groupe="" 09="" 10="" 26="" 41="" 42="" groupf="" 11="" 12="" 27="" 28="" 43="" 44="" groupg="" 13="" 14="" 29="" 30="" 45="" 46="" grouph="" 15="" 16="" 31="" 32="" 47="" 48="" round16="" 49="" 50="" 51="" 52="" 53="" 54="" 55="" quater="" 57="" 58="" 59="" 60="" semi="" 61="" 62="" 64="" wooden="" 63="" groupar="" -ldply="" groupbr="" groupcr="" groupdr="" grouper="" groupfr="" groupgr="" grouphr="" round16r="" quaterr="" semir="" finalr="" woodenr="" datadir="" home="" paul="" workspace="" world_cup="" write="" csv="" world="" cup="" -rbind="" worldcup2006="" pre="">

The processing script

# TODO: Add comment
#
# Author: paul
###############################################################################
require(ggplot2)
require(plyr)
require(stringr)
require(RCurl)
require(XML)
sortframe<-function(df,...){df[do.call(order,list(...)),]}
datadir<-"/home/paul/workspace/world_cup/data/"
world.cup.2006<-read.csv(paste(datadir, "worldcup2006.csv", sep=""))
world.cup.2006$Timen<-as.numeric(str_extract(as.character(world.cup.2006$Time)," [0-9]*"))
teamgoals<-ddply(subset(world.cup.2006,!is.na(Team)),.(Team),nrow)
top5<-subset(world.cup.2006,Team %in% (with(teamgoals,sortframe(teamgoals,-V1))$Team[1:5]))
top5$Team<-factor(top5$Team)
write.csv(top5, paste(datadir, "top5.csv", sep=""))
firstgoal<-ddply(world.cup.2006,.(match),function(df) {
with(df,sortframe(df,Timen))
return(df[1,])
})
write.csv(firstgoal,paste(datadir, "firstgoals.csv", sep=""))
top5firstgoal<-ddply(top5,.(match),function(df) {
with(df,sortframe(df,Timen))
return(df[1,])
})
write.csv(top5firstgoal,paste(datadir, "top5firstgoal", sep=""))

and the graphs

# TODO: Add comment
#
# Author: paul
###############################################################################
require(ggplot2)
require(plyr)
require(stringr)
datadir<-"/home/paul/workspace/world_cup/data/"
firstgoal<-read.csv(file=paste(datadir, "
firstgoals.csv", sep=""))
top5firstgoal<-read.csv(file=paste(datadir, "top5firstgoal.csv", sep=""))
top5<-read.csv(file=paste(datadir, "top5.csv", sep=""))
print(qplot(Timen,data=firstgoal, geom="histogram", binwidth=1))
qplot(Timen,data=firstgoal, geom="histogram", binwidth=5)
qplot(Timen,data=firstgoal, geom="histogram", binwidth=10)
qplot(factor(Team), Timen, data=firstgoal, geom="boxplot")+geom_jitter()
qplot(factor(Team), Timen, data=world.cup.2006, geom="boxplot")+geom_jitter()
qplot(Timen,data=top5firstgoal, geom="histogram", binwidth=1)
qplot(Timen,data=top5firstgoal, geom="histogram", binwidth=5)
ggplot(top5firstgoal,aes(Timen, fill=Team))+geom_density(alpha=0.2)
qplot(factor(Team), Timen, data=top5firstgoal, geom="boxplot")+geom_jitter()
qplot(factor(Team), Timen, data=top5, geom="boxplot")+geom_jitter()

time2score byteam box

 

time2score hist

 

time2score by team