17 Data
17.1 MongoDB
in Mongo
shell to check the location of your MongoDB files
db.adminCommand("getCmdLineOpts")
17.2 WRDS
Retrieve data from WRDS
## Warning: package 'RPostgres' was built under R version 4.0.5
# to set up connection from R to WRDS (https://wrds-www.wharton.upenn.edu/pages/support/programming-wrds/programming-r/r-from-your-computer/)
library(RPostgres)
library(dplyr)
# I've set up wrds connection before hand. Please use your username and password here.
# wrds <- dbConnect(Postgres(),
# host='wrds-pgdata.wharton.upenn.edu',
# port=9737,
# dbname='wrds',
# sslmode='require',
# user='')
# Check variables (column headers) in COMP ANNUAL FUNDAMENTAL
#uses the already-established wrds connection to prepare the SQL query string and save the query as the result res.
# check avaiable databases: https://wrds-web.wharton.upenn.edu/wrds/tools/variable.cfm?vendor_id=7
res <- dbSendQuery(wrds, "select column_name
from information_schema.columns
where table_schema='compa'
and table_name='funda'
order by column_name")
data <- dbFetch(res, n=-1) # fetches the data that results from running the query res against wrds and stores it as data
dbClearResult(res) # closes the connection
head(data)
## column_name
## 1 acchg
## 2 acco
## 3 accrt
## 4 acctchg
## 5 acctstd
## 6 acdo
# select everything
res <- dbSendQuery(wrds, "select * from compa.funda")
# from compa.funda
# only select the following variables
res <- dbSendQuery(wrds, "select gvkey, datadate, fyear, indfmt, consol, popsrc, datafmt, tic, cusip, conm, curcd, fyr, act, at, bkvlps, ceq, ch, che, dltt, dlc, emp, np, exchg, cik, costat, naicsh,mkvalt from compa.funda") #check variables from (https://wrds-web.wharton.upenn.edu/wrds/ds/comp/funda/index.cfm?navId=80)
## Warning in result_create(conn@ptr, statement, immediate): Closing open result
## set, cancelling previous query
17.3 YouTube
-
YouTube Ads
- YouTube Analytics and Reporting APIs: All YouTube Analytics and YouTube Reporting API requests must be authorized by the channel or content owner that owns the requested data
17.3.1 OAuth
Go to link to access your API
library(tuber)
app_id = "YOUR APP ID"
app_secret = "YOUR APP SECRET"
yt_oauth(app_id, app_secret)
get_stats(video_id = "N708P-A45D0")
Note for Ubuntu user
httr::set_config(httr::config( ssl_verifypeer = 0L ) )
Get info about a video
get_video_details(video_id="N708P-A45D0")
Get caption of a video
get_captions(video_id="yJXTXN4xrI8")
Search Videos
Get Comments of a video
res <- get_comment_threads(c(video_id="N708P-A45D0"))
head(res)
Get All the Comments Including Replies
get_all_comments(video_id = "a-UQz7fqR3w")
Get statistics of all the videos in a channel
a <- list_channel_resources(filter = c(channel_id = "UCT5Cx1l4IS3wHkJXNyuj4TA"), part="contentDetails")
# Uploaded playlists:
playlist_id <- a$items[[1]]$contentDetails$relatedPlaylists$uploads
# Get videos on the playlist
vids <- get_playlist_items(filter= c(playlist_id=playlist_id))
# Video ids
vid_ids <- as.vector(vids$contentDetails.videoId)
# Function to scrape stats for all vids
get_all_stats <- function(id) {
get_stats(id)
}
# Get stats and convert results to data frame
res <- lapply(vid_ids, get_all_stats)
res_df <- do.call(rbind, lapply(res, data.frame))
head(res_df)
17.3.2 API
require(curl)
require(jsonlite)
library(kableExtra)
library(dplyr)
library(ggplot2)
library(plotly)
library(reshape2)
API_key = "YOUR API KEY"
getstats_video<-function(video_id,API_key){
url=paste0("https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics&id=",video_id,"&key=",API_key)
result <- fromJSON(txt=url)
salida=list()
return(data.frame(name=result$items$snippet$channelTitle, result$items$statistics,title=result$items$snippet$title,date=result$items$snippet$publishedAt,descrip=result$items$snippet$description))
}
get_playlist_canal<-function(id,API_key,topn=15){
url=paste0('https://www.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=',id,'&key=',API_key,'&maxResults=',topn)
result=fromJSON(txt=url)
return(data.frame(result$items$contentDetails))
}
getstats_canal<-function(id,API_key){
url=paste0('https://www.googleapis.com/youtube/v3/channels?part=snippet%2CcontentDetails%2Cstatistics&id=',id,'&key=',API_key)
result <- fromJSON(txt=url)
return(data.frame(name=result$items$snippet$title,result$items$statistics,pl_list_id=result$items$contentDetails$relatedPlaylists))
}
getall_channels<-function(ids,API_key,topn=5){
videos=lapply(ids,FUN=get_playlist_canal,API_key=API_key,topn=topn) %>% bind_rows()
stats=lapply(videos[,1],FUN=getstats_video,API_key=API_key)
stats=bind_rows(stats)
stats$vid_id=videos[,1]
return(stats)
}
Statistics per Channel
can_st=lapply(comp_data$cha_id,FUN = getstats_canal,API_key=API_key)
can_st=bind_rows(can_st)
can_st$viewCount=as.numeric(can_st$viewCount)
can_st[,1:6] %>% kable() %>%kable_styling()
can_st$viewCount=round(as.numeric(can_st$viewCount)/1000000,2)
p1=can_st %>% ggplot(aes(x=reorder(name,viewCount),y=viewCount,fill=name))+
geom_bar(stat="sum")+guides(size=F)+coord_flip()+scale_fill_manual(values = c("red", "darkblue", "yellow2"))+
geom_text(inherit.aes = T,aes(label=paste(viewCount,"M")),nudge_y =0,angle = 90)+
labs(x="Total Visualizations(Millions)",y="Visualizations",fill="")+
theme(legend.position = "top")+mytheme3
ggplotly(p1,tooltip=c("name","viewCount")) %>%
layout(legend = list(orientation = "h",x = 0.01, y = -0.1,autosize=F))
Information on Individual Videos per Channel
var_to_see="dislikeCount" #favoriteCount or commentCount
info=getall_channels(ids = can_st$pl_list_id.uploads,API_key = API_key,topn =20)
datacond=melt(info[,c(1:6,8)],id.vars = c("name","date"))
datacond$date=as.Date(datacond$date)
datacond$value=as.numeric(datacond$value)
ggplot(filter(datacond,variable==var_to_see),aes(x=as.Date(date),y=value,fill=name))+
geom_bar(stat="sum")+labs(x=var_to_see,y="",fill="")+guides(size=FALSE)+scale_fill_manual(values = c("red", "darkblue", "yellow2"))+theme(legend.position = "top")+
scale_x_date(limits =as.Date(c(as.Date(min(datacond$date)),as.Date(Sys.time()))),date_breaks ="month",date_labels="%b %y")+theme(legend.position = "top")+mytheme2
17.3.3 Python
# from pytube import YouTube
# YouTube('https://youtube.com/watch?v=2lAe1cqCOXo').streams.first().download()
import requests
import json
= requests.get("http://gdata.youtube.com/feeds/api/standardfeeds/top_rated?v=2&alt=jsonc")
r #r.text
= json.loads(r.text)
data
data# for item in data['data']['items']:
# print "Video Title: %s" % (item['title'])
# print "Video Category: %s" % (item['category'])
# print "Video ID: %s" % (item['id'])
# print "Video Rating: %f" % (item['rating'])
# print "Embed URL: %s" % (item['player']['default'])
Using BeautifulSoup
from requests_html import HTMLSession
from bs4 import BeautifulSoup as bs # importing BeautifulSoup
# sample youtube video url
= "https://www.youtube.com/watch?v=jNQXAC9IVRw"
video_url # init an HTML Session
= HTMLSession()
session # get the html content
= session.get(video_url)
response # execute Java-script
=1)
response.html.render(sleep# create bs object to parse HTML
= bs(response.html.html, "html.parser")
soup # write all HTML code into a file
open("video.html", "w", encoding='utf8').write(response.html.html)
Generate random video
import json
import urllib.request
import string
import random
= 50
count = 'your_key'
API_KEY = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(3))
random
= "https://www.googleapis.com/youtube/v3/search?key={}&maxResults={}&part=snippet&type=video&q={}".format(API_KEY,count,random)
urlData = urllib.request.urlopen(urlData)
webURL = webURL.read()
data = webURL.info().get_content_charset('utf-8')
encoding = json.loads(data.decode(encoding))
results
for data in results['items']:
= (data['id']['videoId'])
videoId print(videoId)
#store your ids
Random used by YouTube API, test
# -*- coding: utf-8 -*-
# Sample Python code for youtube.search.list
# See instructions for running these code samples locally:
# https://developers.google.com/explorer-help/guides/code_samples#python
import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
= ["https://www.googleapis.com/auth/youtube.force-ssl"]
scopes
def main():
# Disable OAuthlib's HTTPS verification when running locally.
# *DO NOT* leave this option enabled in production.
"OAUTHLIB_INSECURE_TRANSPORT"] = "1"
os.environ[
= "youtube"
api_service_name = "v3"
api_version = "YOUR_CLIENT_SECRET_FILE.json"
client_secrets_file
# Get credentials and create an API client
= google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
flow
client_secrets_file, scopes)= flow.run_console()
credentials = googleapiclient.discovery.build(
youtube =credentials)
api_service_name, api_version, credentials
= youtube.search().list(
request ="snippet",
part=25,
maxResults="surfing"
q
)= request.execute()
response
print(response)
if __name__ == "__main__":
main()
Another way, but this is a dirty crawler, not for production
import re, urllib
from random import randint
def random_str(str_size):
= ""
res for i in xrange(str_size):
= randint(0,25)
x = chr(ord('a')+x)
c += c
res return res
def find_watch(text,pos):
= text.find("watch?v=",pos)
start if (start<0):
return None,None
= text.find(" ",start)
end if (end<0):
return None,None
if (end-start > 200): #silly heuristics, probably not a must
return None,None
return text[start:end-1], start
def find_instance_links():
= 'https://www.youtube.com/results?search_query='
base_url = base_url+random_str(3)
url #print url
= urllib.urlopen(url).read()
r
= {}
links
= 0
pos while True:
= find_watch(r,pos)
link,pos if link == None or pos == None:
break
+= 1
pos #print link
if (";" in link):
continue
= 1
links[link]
= links.items()
items_list
= len(items_list)
list_size = randint(list_size/2,list_size-1)
selected return items_list[selected][0]
for i in xrange(1000):
7,20)) # pause randomly between 7 and 20 seconds
sleep(randint(= find_instance_links()
link print link
17.4 Consumer Expenditure
A consumer unit: “a single person, or group of person who live together and who share responsibilities for most major expenditures.”
The “Reference Person” is the first person named when the respondent is asked, “Who is responsible for owning or renting this home?”
-
2 components to the Consumer Expenditure:
-
Interview survey
- for “big ticket” and recurring expenditures, and global estimates on others
- every three months for four quarter
- rotating-panel survey
- 6,000 consumer units each quarter
-
Diary survey
small-ticket, frequently purchased items
independent from the interview survey
about 14,000 diaries within families each year
-
-
Table format
-
annual
Jan to Dec: since 1984
July to June: since 2013
-
two-year (Jan year 1 - Dec year 2)
- since 1986
-
-
Experimental research
all consumer units (available on request)
E.g., income tables, generational groups (e.g., millennial, gen X)