19.3 YouTube

YouTube Ads
YouTube Analytics and Reporting APIs: All YouTube Analytics and YouTube Reporting API requests must be authorized by the channel or content owner that owns the requested data

19.3.1 OAuth

Go to link to access your API

library(tuber)
app_id = "YOUR APP ID"
app_secret = "YOUR APP SECRET"
yt_oauth(app_id, app_secret)


get_stats(video_id = "N708P-A45D0")

Note for Ubuntu user

httr::set_config(httr::config( ssl_verifypeer = 0L ) )

Get info about a video

get_video_details(video_id="N708P-A45D0")

Get caption of a video

get_captions(video_id="yJXTXN4xrI8")

Search Videos

res <- yt_search(term = "test")
head(res[, 1:3])

Get Comments of a video

res <- get_comment_threads(c(video_id="N708P-A45D0"))
head(res)

Get All the Comments Including Replies

get_all_comments(video_id = "a-UQz7fqR3w")

Get statistics of all the videos in a channel

a <- list_channel_resources(filter = c(channel_id = "UCT5Cx1l4IS3wHkJXNyuj4TA"), part="contentDetails")

# Uploaded playlists:
playlist_id <- a$items[[1]]$contentDetails$relatedPlaylists$uploads

# Get videos on the playlist
vids <- get_playlist_items(filter= c(playlist_id=playlist_id)) 

# Video ids
vid_ids <- as.vector(vids$contentDetails.videoId)

# Function to scrape stats for all vids
get_all_stats <- function(id) {
  get_stats(id)
} 

# Get stats and convert results to data frame 
res <- lapply(vid_ids, get_all_stats)
res_df <- do.call(rbind, lapply(res, data.frame))

head(res_df)

19.3.2 API

require(curl)
require(jsonlite)
library(kableExtra)
library(dplyr)
library(ggplot2)
library(plotly)
library(reshape2)

API_key = "YOUR API KEY"

getstats_video<-function(video_id,API_key){
 
  url=paste0("https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics&id=",video_id,"&key=",API_key)
  result <- fromJSON(txt=url)  
  salida=list()
  return(data.frame(name=result$items$snippet$channelTitle, result$items$statistics,title=result$items$snippet$title,date=result$items$snippet$publishedAt,descrip=result$items$snippet$description))
}

get_playlist_canal<-function(id,API_key,topn=15){
  
  url=paste0('https://www.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=',id,'&key=',API_key,'&maxResults=',topn)
  
  result=fromJSON(txt=url)
  return(data.frame(result$items$contentDetails))
}

getstats_canal<-function(id,API_key){

url=paste0('https://www.googleapis.com/youtube/v3/channels?part=snippet%2CcontentDetails%2Cstatistics&id=',id,'&key=',API_key)
  
result <- fromJSON(txt=url)   
return(data.frame(name=result$items$snippet$title,result$items$statistics,pl_list_id=result$items$contentDetails$relatedPlaylists))
  
}

getall_channels<-function(ids,API_key,topn=5){
  
  videos=lapply(ids,FUN=get_playlist_canal,API_key=API_key,topn=topn) %>% bind_rows()
  
  stats=lapply(videos[,1],FUN=getstats_video,API_key=API_key)
  stats=bind_rows(stats)
  stats$vid_id=videos[,1]
  return(stats)
}

Statistics per Channel

can_st=lapply(comp_data$cha_id,FUN = getstats_canal,API_key=API_key)
can_st=bind_rows(can_st)
can_st$viewCount=as.numeric(can_st$viewCount)
can_st[,1:6] %>% kable() %>%kable_styling()

can_st$viewCount=round(as.numeric(can_st$viewCount)/1000000,2)
p1=can_st %>% ggplot(aes(x=reorder(name,viewCount),y=viewCount,fill=name))+
geom_bar(stat="sum")+guides(size=F)+coord_flip()+scale_fill_manual(values = c("red", "darkblue", "yellow2"))+
geom_text(inherit.aes = T,aes(label=paste(viewCount,"M")),nudge_y =0,angle = 90)+
labs(x="Total Visualizations(Millions)",y="Visualizations",fill="")+
theme(legend.position = "top")+mytheme3
  
  ggplotly(p1,tooltip=c("name","viewCount"))  %>% 
    layout(legend = list(orientation = "h",x = 0.01, y = -0.1,autosize=F))

Information on Individual Videos per Channel

var_to_see="dislikeCount" #favoriteCount or commentCount

info=getall_channels(ids = can_st$pl_list_id.uploads,API_key = API_key,topn =20)


 datacond=melt(info[,c(1:6,8)],id.vars = c("name","date"))
 datacond$date=as.Date(datacond$date)
 datacond$value=as.numeric(datacond$value)
    
    
ggplot(filter(datacond,variable==var_to_see),aes(x=as.Date(date),y=value,fill=name))+
geom_bar(stat="sum")+labs(x=var_to_see,y="",fill="")+guides(size=FALSE)+scale_fill_manual(values = c("red", "darkblue", "yellow2"))+theme(legend.position = "top")+
  scale_x_date(limits =as.Date(c(as.Date(min(datacond$date)),as.Date(Sys.time()))),date_breaks  ="month",date_labels="%b %y")+theme(legend.position = "top")+mytheme2

19.3.3 Python

Pytube

# from pytube import YouTube
# YouTube('https://youtube.com/watch?v=2lAe1cqCOXo').streams.first().download()

import requests
import json
r = requests.get("http://gdata.youtube.com/feeds/api/standardfeeds/top_rated?v=2&alt=jsonc")
#r.text
data = json.loads(r.text)
data
# for item in data['data']['items']:
#   print "Video Title: %s" % (item['title'])
#   print "Video Category: %s" % (item['category'])
#   print "Video ID: %s" % (item['id'])
#   print "Video Rating: %f" % (item['rating'])
#   print "Embed URL: %s" % (item['player']['default'])

Using BeautifulSoup

from requests_html import HTMLSession 
from bs4 import BeautifulSoup as bs # importing BeautifulSoup

# sample youtube video url
video_url = "https://www.youtube.com/watch?v=jNQXAC9IVRw"
# init an HTML Session
session = HTMLSession()
# get the html content
response = session.get(video_url)
# execute Java-script
response.html.render(sleep=1)
# create bs object to parse HTML
soup = bs(response.html.html, "html.parser")
# write all HTML code into a file
open("video.html", "w", encoding='utf8').write(response.html.html)

Generate random video

import json
import urllib.request
import string
import random

count = 50
API_KEY = 'your_key'
random = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(3))

urlData = "https://www.googleapis.com/youtube/v3/search?key={}&maxResults={}&part=snippet&type=video&q={}".format(API_KEY,count,random)
webURL = urllib.request.urlopen(urlData)
data = webURL.read()
encoding = webURL.info().get_content_charset('utf-8')
results = json.loads(data.decode(encoding))

for data in results['items']:
    videoId = (data['id']['videoId'])
    print(videoId)
    #store your ids

Random used by YouTube API, test

# -*- coding: utf-8 -*-

# Sample Python code for youtube.search.list
# See instructions for running these code samples locally:
# https://developers.google.com/explorer-help/guides/code_samples#python

import os

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]

def main():
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "YOUR_CLIENT_SECRET_FILE.json"

    # Get credentials and create an API client
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
        client_secrets_file, scopes)
    credentials = flow.run_console()
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

    request = youtube.search().list(
        part="snippet",
        maxResults=25,
        q="surfing"
    )
    response = request.execute()

    print(response)

if __name__ == "__main__":
    main()

Another way, but this is a dirty crawler, not for production

import re, urllib
from random import randint

def random_str(str_size):
    res = ""
    for i in xrange(str_size):
        x = randint(0,25)
        c = chr(ord('a')+x)
        res += c
    return res

def find_watch(text,pos):
    start = text.find("watch?v=",pos)
    if (start<0):
        return None,None
    end = text.find(" ",start)
    if (end<0):
        return None,None

    if (end-start > 200): #silly heuristics, probably not a must
        return None,None

    return text[start:end-1], start


def find_instance_links():
    base_url = 'https://www.youtube.com/results?search_query='
    url = base_url+random_str(3)
    #print url

    r = urllib.urlopen(url).read()

    links = {}

    pos = 0
    while True:
        link,pos =  find_watch(r,pos)
        if link == None or pos == None:
            break
        pos += 1
        #print link
        if (";" in link):
            continue
        links[link] = 1

    items_list = links.items()

    list_size = len(items_list)
    selected = randint(list_size/2,list_size-1)
    return items_list[selected][0]


for i in xrange(1000):
    sleep(randint(7,20)) # pause randomly between 7 and 20 seconds
    link = find_instance_links()
    print link