19 Data
19.1 MongoDB
Run the following command in the Mongo shell to check the location of your MongoDB files:
db.adminCommand("getCmdLineOpts")
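If you prefer to stay in R, the same admin command can be issued through the mongolite package. This is a minimal sketch, assuming mongolite is installed and a MongoDB instance is running locally; the connection URL and collection name are placeholders.
library(mongolite)
# connect to the admin database of a local MongoDB instance (URL is a placeholder)
con <- mongo(collection = "test", db = "admin", url = "mongodb://localhost:27017")
# run the same admin command; the parsed storage options (e.g., dbPath) show where the files live
opts <- con$run('{"getCmdLineOpts": 1}')
opts$parsed$storage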
19.2 WRDS
Retrieve data from WRDS
# to set up connection from R to WRDS (https://wrds-www.wharton.upenn.edu/pages/support/programming-wrds/programming-r/r-from-your-computer/)
library(RPostgres)
library(dplyr)
# I've set up the wrds connection beforehand. Please use your username and password here.
# wrds <- dbConnect(Postgres(),
# host='wrds-pgdata.wharton.upenn.edu',
# port=9737,
# dbname='wrds',
# sslmode='require',
# user='')
# Check variables (column headers) in COMP ANNUAL FUNDAMENTAL
# use the already-established wrds connection to prepare the SQL query string and save it as res
# check available databases: https://wrds-web.wharton.upenn.edu/wrds/tools/variable.cfm?vendor_id=7
res <- dbSendQuery(wrds, "select column_name
from information_schema.columns
where table_schema='compa'
and table_name='funda'
order by column_name")
data <- dbFetch(res, n=-1) # fetch all rows returned by running the query res against wrds and store them as data
dbClearResult(res) # free the result set (this does not close the wrds connection itself)
head(data)
# select everything from compa.funda
res <- dbSendQuery(wrds, "select * from compa.funda")
# only select the following variables
res <- dbSendQuery(wrds, "select gvkey, datadate, fyear, indfmt, consol, popsrc, datafmt, tic, cusip, conm, curcd, fyr, act, at, bkvlps, ceq, ch, che, dltt, dlc, emp, np, exchg, cik, costat, naicsh,mkvalt from compa.funda") #check variables from (https://wrds-web.wharton.upenn.edu/wrds/ds/comp/funda/index.cfm?navId=80)
# res <- dbSendQuery(wrds, "select gvkey, gp from compa.funda") #check variables from
data1 <- dbFetch(res, n=-1)
dbClearResult(res)
data = data1 %>%
distinct(gvkey,datadate,fyear,tic,conm,.keep_all = T)
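When you are finished pulling data, close the result sets (as above) and then the WRDS connection itself. A minimal sketch using the standard DBI call, assuming the wrds connection object created earlier:
# close the WRDS connection when done
dbDisconnect(wrds)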
19.3 YouTube
- YouTube Ads
- YouTube Analytics and Reporting APIs: all YouTube Analytics and YouTube Reporting API requests must be authorized by the channel or content owner that owns the requested data.
19.3.1 OAuth
Go to the Google Developers Console to create your API credentials (app ID and app secret).
library(tuber)
app_id = "YOUR APP ID"
app_secret = "YOUR APP SECRET"
yt_oauth(app_id, app_secret)
get_stats(video_id = "N708P-A45D0")
Note for Ubuntu users:
httr::set_config(httr::config( ssl_verifypeer = 0L ) )
Get info about a video
get_video_details(video_id="N708P-A45D0")
Get caption of a video
get_captions(video_id="yJXTXN4xrI8")
Search Videos
res <- yt_search(term = "test")
head(res[, 1:3])
Get Comments of a video
Get All the Comments Including Replies
get_all_comments(video_id = "a-UQz7fqR3w")
Get statistics of all the videos in a channel
a <- list_channel_resources(filter = c(channel_id = "UCT5Cx1l4IS3wHkJXNyuj4TA"), part="contentDetails")
# Uploaded playlists:
playlist_id <- a$items[[1]]$contentDetails$relatedPlaylists$uploads
# Get videos on the playlist
vids <- get_playlist_items(filter= c(playlist_id=playlist_id))
# Video ids
vid_ids <- as.vector(vids$contentDetails.videoId)
# Function to scrape stats for all vids
get_all_stats <- function(id) {
get_stats(id)
}
# Get stats and convert results to data frame
res <- lapply(vid_ids, get_all_stats)
res_df <- do.call(rbind, lapply(res, data.frame))
head(res_df)
19.3.2 API
require(curl)
require(jsonlite)
library(kableExtra)
library(dplyr)
library(ggplot2)
library(plotly)
library(reshape2)
API_key = "YOUR API KEY"
getstats_video <- function(video_id, API_key) {
  # query the YouTube Data API v3 "videos" endpoint for snippet and statistics
  url = paste0("https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics&id=", video_id, "&key=", API_key)
  result <- fromJSON(txt = url)
  return(
    data.frame(
      name = result$items$snippet$channelTitle,
      result$items$statistics,
      title = result$items$snippet$title,
      date = result$items$snippet$publishedAt,
      descrip = result$items$snippet$description
    )
  )
}
get_playlist_canal <- function(id, API_key, topn = 15) {
  # list up to topn items (video ids) from a playlist
  url = paste0('https://www.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=', id, '&key=', API_key, '&maxResults=', topn)
  result = fromJSON(txt = url)
  return(data.frame(result$items$contentDetails))
}
getstats_canal <- function(id, API_key) {
  # channel-level statistics plus the id of the channel's "uploads" playlist
  url = paste0('https://www.googleapis.com/youtube/v3/channels?part=snippet%2CcontentDetails%2Cstatistics&id=', id, '&key=', API_key)
  result <- fromJSON(txt = url)
  return(data.frame(name = result$items$snippet$title, result$items$statistics, pl_list_id = result$items$contentDetails$relatedPlaylists))
}
getall_channels <- function(ids, API_key, topn = 5) {
  # for each uploads-playlist id, list its videos and then pull per-video statistics
  videos = lapply(ids, FUN = get_playlist_canal, API_key = API_key, topn = topn) %>% bind_rows()
  stats = lapply(videos[, 1], FUN = getstats_video, API_key = API_key)
  stats = bind_rows(stats)
  stats$vid_id = videos[, 1]
  return(stats)
}
Statistics per Channel
# comp_data is a data frame (defined elsewhere) with a column cha_id of YouTube channel ids
can_st=lapply(comp_data$cha_id,FUN = getstats_canal,API_key=API_key)
can_st=bind_rows(can_st)
can_st$viewCount=as.numeric(can_st$viewCount)
can_st[,1:6] %>% kable() %>%kable_styling()
can_st$viewCount=round(as.numeric(can_st$viewCount)/1000000,2)
p1=can_st %>% ggplot(aes(x=reorder(name,viewCount),y=viewCount,fill=name))+
geom_bar(stat="sum")+guides(size=F)+coord_flip()+scale_fill_manual(values = c("red", "darkblue", "yellow2"))+
geom_text(inherit.aes = T,aes(label=paste(viewCount,"M")),nudge_y =0,angle = 90)+
labs(x="Total Visualizations(Millions)",y="Visualizations",fill="")+
theme(legend.position = "top")+mytheme3 # mytheme3 is a custom ggplot theme defined elsewhere (not shown here)
ggplotly(p1,tooltip=c("name","viewCount")) %>%
layout(legend = list(orientation = "h",x = 0.01, y = -0.1,autosize=F))
Information on Individual Videos per Channel
var_to_see="dislikeCount" # favoriteCount or commentCount; note the API no longer returns public dislike counts
info=getall_channels(ids = can_st$pl_list_id.uploads,API_key = API_key,topn =20)
datacond=melt(info[,c(1:6,8)],id.vars = c("name","date"))
datacond$date=as.Date(datacond$date)
datacond$value=as.numeric(datacond$value)
ggplot(filter(datacond,variable==var_to_see),aes(x=as.Date(date),y=value,fill=name))+
geom_bar(stat="sum")+labs(x=var_to_see,y="",fill="")+guides(size=FALSE)+scale_fill_manual(values = c("red", "darkblue", "yellow2"))+theme(legend.position = "top")+
scale_x_date(limits =as.Date(c(as.Date(min(datacond$date)),as.Date(Sys.time()))),date_breaks ="month",date_labels="%b %y")+theme(legend.position = "top")+mytheme2
19.3.3 Python
# from pytube import YouTube
# YouTube('https://youtube.com/watch?v=2lAe1cqCOXo').streams.first().download()
import requests
import json

# Note: the gdata feed below belongs to the retired YouTube Data API v2, so this
# request no longer returns data; use the Data API v3 samples below instead.
r = requests.get("http://gdata.youtube.com/feeds/api/standardfeeds/top_rated?v=2&alt=jsonc")
# r.text
data = json.loads(r.text)
data
# for item in data['data']['items']:
#     print("Video Title: %s" % item['title'])
#     print("Video Category: %s" % item['category'])
#     print("Video ID: %s" % item['id'])
#     print("Video Rating: %f" % item['rating'])
#     print("Embed URL: %s" % item['player']['default'])
Using BeautifulSoup
from requests_html import HTMLSession
from bs4 import BeautifulSoup as bs # importing BeautifulSoup
# sample youtube video url
video_url = "https://www.youtube.com/watch?v=jNQXAC9IVRw"
# init an HTML Session
session = HTMLSession()
# get the html content
response = session.get(video_url)
# execute JavaScript
response.html.render(sleep=1)
# create bs object to parse HTML
soup = bs(response.html.html, "html.parser")
# write all HTML code into a file
with open("video.html", "w", encoding="utf8") as f:
    f.write(response.html.html)
Generate random video IDs
import json
import urllib.request
import string
import random

count = 50
API_KEY = 'your_key'

# random 3-character query string (use a separate name so the random module is not shadowed)
random_query = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(3))
urlData = "https://www.googleapis.com/youtube/v3/search?key={}&maxResults={}&part=snippet&type=video&q={}".format(API_KEY, count, random_query)
webURL = urllib.request.urlopen(urlData)
data = webURL.read()
encoding = webURL.info().get_content_charset('utf-8')
results = json.loads(data.decode(encoding))

for item in results['items']:
    videoId = item['id']['videoId']
    print(videoId)
    # store your ids
Random search using the official YouTube Data API client (test):
# -*- coding: utf-8 -*-
# Sample Python code for youtube.search.list
# See instructions for running these code samples locally:
# https://developers.google.com/explorer-help/guides/code_samples#python
import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]
def main():
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "YOUR_CLIENT_SECRET_FILE.json"

    # Get credentials and create an API client
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
        client_secrets_file, scopes)
    credentials = flow.run_console()
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

    request = youtube.search().list(
        part="snippet",
        maxResults=25,
        q="surfing"
    )
    response = request.execute()

    print(response)

if __name__ == "__main__":
    main()
Another way is a quick-and-dirty crawler, not for production:
import urllib.request
from random import randint
from time import sleep

def random_str(str_size):
    # build a random lowercase string of the given length to use as a search query
    res = ""
    for i in range(str_size):
        x = randint(0, 25)
        c = chr(ord('a') + x)
        res += c
    return res

def find_watch(text, pos):
    # find the next "watch?v=" link in the page source, starting at position pos
    start = text.find("watch?v=", pos)
    if start < 0:
        return None, None
    end = text.find(" ", start)
    if end < 0:
        return None, None
    if end - start > 200:  # silly heuristics, probably not a must
        return None, None
    return text[start:end - 1], start

def find_instance_links():
    base_url = 'https://www.youtube.com/results?search_query='
    url = base_url + random_str(3)
    r = urllib.request.urlopen(url).read().decode('utf-8', errors='ignore')
    links = {}
    pos = 0
    while True:
        link, pos = find_watch(r, pos)
        if link is None or pos is None:
            break
        pos += 1
        if ";" in link:
            continue
        links[link] = 1
    items_list = list(links.items())
    list_size = len(items_list)
    selected = randint(list_size // 2, list_size - 1)
    return items_list[selected][0]

for i in range(1000):
    sleep(randint(7, 20))  # pause randomly between 7 and 20 seconds
    link = find_instance_links()
    print(link)
19.4 Consumer Expenditure
A consumer unit: “a single person, or group of persons who live together and who share responsibilities for most major expenditures.”
The “Reference Person” is the first person named when the respondent is asked, “Who is responsible for owning or renting this home?”
- Two components of the Consumer Expenditure Survey:
  - Interview survey
    - for “big ticket” and recurring expenditures, and global estimates on others
    - every three months for four quarters
    - rotating-panel survey
    - 6,000 consumer units each quarter
  - Diary survey
    - small-ticket, frequently purchased items
    - independent from the interview survey
    - about 14,000 diaries within families each year
- Table formats
  - annual
    - Jan to Dec: since 1984
    - July to June: since 2013
  - two-year (Jan of year 1 to Dec of year 2)
    - since 1986
- Experimental research
  - all consumer units (available on request)
  - e.g., income tables, generational groups (e.g., millennials, Gen X)
19.5 Gender, Age, Nationality
Now, we can make credible predictions about a person’s gender, age, and nationality based on his or her name.
The APIs are:

- https://genderize.io/
- https://agify.io/
- https://nationalize.io/
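As a minimal sketch, these services can be queried directly from R with httr and jsonlite; the name below is only an illustration, and the free tiers of these APIs are rate-limited.
library(httr)
library(jsonlite)
# query genderize.io for an example name (illustrative only)
res <- GET("https://api.genderize.io", query = list(name = "taylor"))
fromJSON(content(res, as = "text", encoding = "UTF-8"))
# agify.io and nationalize.io follow the same pattern with a name parameter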
19.6 Google Trends
Online search volume and news reporting show a strong correlation, while academic publishing lags behind online public interest because of its delayed review and publication process (Nghiem et al. 2016). This result was interpreted to mean that news serves as a conduit between the research community and the public.
Google Trends only returns data at the following granularity, depending on the time window:

Data Granularity | Data Window
---|---
Hourly Data | Last 7 days
Daily Data | Less than 9 months
Weekly Data | Between 9 months and 5 years
Monthly Data | Longer than 5 years
Since the data are indexed within the chosen time window, the same keywords queried over different time windows can have different index values.
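As a rough illustration of the table above, the time argument in gtrendsR controls the window and hence the granularity of the returned series. A sketch, assuming the gtrendsR package; the keyword is arbitrary.
library(gtrendsR)
hourly  <- gtrends("coffee", time = "now 7-d")   # last 7 days -> hourly data
daily   <- gtrends("coffee", time = "today 3-m") # under 9 months -> daily data
monthly <- gtrends("coffee", time = "all")       # since 2004 -> monthly data
head(hourly$interest_over_time)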
19.6.1 Relative Search
library(gtrendsR)
# for proxy
# setHandleParameters(
# user = "xxxx",
# password = "*******",
# domain = "mydomain",
# proxyhost = "10.111.124.113",
# proxyport = 8080
# )
keywords = c("7eleven", "3m")
country = c("US")
time = ("2010-01-01 2012-01-30") # earliest is 2004
channel = "web"
trends = gtrends(keyword = keywords, geo = country, time = time, gprop = channel)
# objects
names(trends)
# this is weekly data only
time_trend <- trends$interest_over_time
- The value is relative to the maximum search volume in the chosen window (not the absolute search volume).
- The channel type (gprop) can be “web” (default), “news”, “images”, “froogle”, or “youtube”.
library(ggplot2)
plot <-
ggplot(data = time_trend, aes(
x = date,
y = hits,
group = keyword,
col = keyword
)) +
geom_line() + xlab('Time') + ylab('Relative Interest (weekly)') + theme_bw() +
theme(
legend.title = element_blank(),
legend.position = "bottom",
legend.text = element_text(size = 12)
) + ggtitle("Google Search Volume")
plot
Smoothing to remove seasonality
plot <-
ggplot(data = time_trend, aes(
x = date,
y = hits,
group = keyword,
col = keyword
)) +
geom_smooth(span = 0.5, se = FALSE) + xlab('Time') + ylab('Relative Interest') +
theme_bw() + theme(
legend.title = element_blank(),
legend.position = "bottom",
legend.text = element_text(size = 12)
) + ggtitle("Google Search Volume")
plot
Alternatively, we can readily use the plot function provided by the gtrendsR package, as shown below.
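For instance, using the trends object retrieved above:
# built-in plot method for gtrends objects
plot(trends)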
- Scaling method (overlapping method) (recommended)
  - A way to get daily data with the gtrendsR package is proposed by Alex Dyachenko (implemented in get_daily_gtrend below):
    1. Get daily estimates for windows shorter than 9 months.
    2. Get monthly estimates for your desired time frame.
    3. Multiply the daily estimates for each month from step 1 by their weights from step 2.
- Concatenation method (normalization/dailydata method)
  - Daily data are concatenated from 1-month queries and normalized by the weekly trends data, which has been done in this post.
  - From this post, we can be sure that the scaling method is better than the normalization method.
Rate limit:
- 1,400 requests in 4 hours
library(gtrendsR)
library(tidyverse)
library(lubridate)
get_daily_gtrend <-
function(keyword = c('7eleven', '3M'),
geo = 'US',
from = '2013-01-01',
to = '2013-02-15') {
if (ymd(to) >= floor_date(Sys.Date(), 'month')) {
to <- floor_date(ymd(to), 'month') - days(1)
if (to < from) {
stop("Specifying \'to\' date in the current month is not allowed")
}
}
aggregated_data <-
gtrends(keyword = keyword,
geo = geo,
time = paste(from, to))
if (is.null(aggregated_data$interest_over_time)) {
print('There is no data in Google Trends!')
return()
}
mult_m <- aggregated_data$interest_over_time %>%
mutate(hits = as.integer(ifelse(hits == '<1', '0', hits))) %>%
group_by(month = floor_date(date, 'month'), keyword) %>%
summarise(hits = sum(hits)) %>%
ungroup() %>%
mutate(ym = format(month, '%Y-%m'),
mult = hits / max(hits)) %>%
select(month, ym, keyword, mult) %>%
as_tibble()
pm <- tibble(
s = seq(ymd(from), ymd(to), by = 'month'),
e = seq(ymd(from), ymd(to), by = 'month') + months(1) - days(1)
)
raw_trends_m <- tibble()
for (i in seq(1, nrow(pm), 1)) {
curr <- gtrends(keyword,
geo = geo,
time = paste(pm$s[i], pm$e[i]))
if (is.null(curr$interest_over_time))
next
print(paste(
'for',
pm$s[i],
pm$e[i],
'retrieved',
count(curr$interest_over_time),
'days of data (all keywords)'
))
raw_trends_m <- rbind(raw_trends_m,
curr$interest_over_time)
}
trend_m <- raw_trends_m %>%
select(date, keyword, hits) %>%
mutate(ym = format(date, '%Y-%m'),
hits = as.integer(ifelse(hits == '<1', '0', hits))) %>%
as_tibble()
trend_res <- trend_m %>%
left_join(mult_m) %>%
mutate(est_hits = hits * mult) %>%
select(date, keyword, est_hits) %>%
as_tibble() %>%
mutate(date = as.Date(date))
return(trend_res)
}
daily_trend <- get_daily_gtrend(
keyword = c('7eleven', '3M'),
geo = 'US',
from = '2013-01-01',
to = '2013-02-01'
)
head(daily_trend)
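To check the result, a quick plot of the rescaled daily series, reusing the ggplot2 setup from earlier:
ggplot(daily_trend, aes(x = date, y = est_hits, col = keyword)) +
  geom_line() +
  xlab('Time') + ylab('Estimated daily interest') +
  theme_bw()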
This method was used in a research paper (Risteski and Davcev 2014)
Similarly, without the gtrendsR package, you can follow Erik Johansson’s method, but Google no longer serves his URL (http://www.google.com/trends/trendsReport?hl=en-US&q=), so you might have to figure out the new URL.
Other methods exist as well.
19.6.2 Absolute Search
See the Google Trends dataset available via BigQuery.
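A minimal sketch of pulling these data from R with the bigrquery package. The table name bigquery-public-data.google_trends.top_terms and its columns are my assumption about the public dataset, and you need a billing-enabled Google Cloud project.
library(bigrquery)
# query the (assumed) public Google Trends table for a few top terms
sql <- "SELECT term, rank, week
        FROM `bigquery-public-data.google_trends.top_terms`
        LIMIT 10"
tb <- bq_project_query("your-gcp-project-id", sql)  # project id is a placeholder
bq_table_download(tb)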
19.7 Baidu Index
See (P. J. Liu et al. 2019) on how to use a Java-based spider to collect search volume data.