19 Data

19.1 MongoDB

In the Mongo shell, run the following to check the location of your MongoDB files (look for the dbPath entry under parsed.storage in the output)

db.adminCommand("getCmdLineOpts")

19.2 WRDS

Retrieve data from WRDS

# to set up connection from R to WRDS (https://wrds-www.wharton.upenn.edu/pages/support/programming-wrds/programming-r/r-from-your-computer/)
library(RPostgres)
library(dplyr)

# I've set up the WRDS connection beforehand. Use your own username and password here.

# wrds <- dbConnect(Postgres(),
#                   host='wrds-pgdata.wharton.upenn.edu',
#                   port=9737,
#                   dbname='wrds',
#                   sslmode='require',
#                   user='')

# Check variables (column headers) in COMP ANNUAL FUNDAMENTAL
# Uses the already-established wrds connection to prepare the SQL query and save it as the result object res.
# check available databases: https://wrds-web.wharton.upenn.edu/wrds/tools/variable.cfm?vendor_id=7
res <- dbSendQuery(wrds, "select column_name
                   from information_schema.columns
                   where table_schema='compa'
                   and table_name='funda'
                   order by column_name")
data <- dbFetch(res, n=-1) # fetches the data that results from running the query res against wrds and stores it as data
dbClearResult(res) # frees resources associated with the result set (not the connection itself)
head(data)

# select everything
res <- dbSendQuery(wrds, "select * from compa.funda")

# from compa.funda

# only select the following variables
res <- dbSendQuery(wrds, "select gvkey, datadate, fyear, indfmt, consol, popsrc, datafmt, tic, cusip, conm, curcd, fyr, act, at, bkvlps, ceq, ch, che, dltt, dlc, emp, np, exchg, cik, costat, naicsh,mkvalt from compa.funda") #check variables from (https://wrds-web.wharton.upenn.edu/wrds/ds/comp/funda/index.cfm?navId=80)

# res <- dbSendQuery(wrds, "select gvkey, gp from compa.funda") #check variables from
data1 <- dbFetch(res, n=-1)
dbClearResult(res)

data <- data1 %>%
    distinct(gvkey, datadate, fyear, tic, conm, .keep_all = TRUE)
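
As a convenience, dbGetQuery() wraps the send/fetch/clear steps into a single call. Below is a minimal sketch (the limit clause is only for a quick check), followed by closing the connection when you are done.

# one-shot query: dbGetQuery() sends the query, fetches all rows, and clears the result
firms <- dbGetQuery(wrds, "select gvkey, datadate, fyear, tic, conm, at
                           from compa.funda
                           limit 10")
head(firms)

# close the WRDS connection once you are finished
dbDisconnect(wrds)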

19.3 YouTube

19.3.1 OAuth

Go to the Google Developers Console to create a project and obtain your API credentials (app ID and secret)

library(tuber)
app_id = "YOUR APP ID"
app_secret = "YOUR APP SECRET"
yt_oauth(app_id, app_secret)


get_stats(video_id = "N708P-A45D0")

Note for Ubuntu users: you may need to disable SSL peer verification

httr::set_config(httr::config( ssl_verifypeer = 0L ) )

Get info about a video

get_video_details(video_id="N708P-A45D0")

Get caption of a video

get_captions(video_id="yJXTXN4xrI8")

Search Videos

res <- yt_search(term = "test")
head(res[, 1:3])

Get Comments of a video

res <- get_comment_threads(c(video_id="N708P-A45D0"))
head(res)

Get All the Comments Including Replies

get_all_comments(video_id = "a-UQz7fqR3w")

Get statistics of all the videos in a channel

a <- list_channel_resources(filter = c(channel_id = "UCT5Cx1l4IS3wHkJXNyuj4TA"), part="contentDetails")

# Uploaded playlists:
playlist_id <- a$items[[1]]$contentDetails$relatedPlaylists$uploads

# Get videos on the playlist
vids <- get_playlist_items(filter= c(playlist_id=playlist_id)) 

# Video ids
vid_ids <- as.vector(vids$contentDetails.videoId)

# Function to scrape stats for all vids
get_all_stats <- function(id) {
  get_stats(id)
} 

# Get stats and convert results to data frame 
res <- lapply(vid_ids, get_all_stats)
res_df <- do.call(rbind, lapply(res, data.frame))

head(res_df)
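
The counts in res_df come back from the API as character strings. A small follow-up sketch (assuming the usual viewCount, likeCount, and commentCount columns are present) converts them to numeric and sorts by views:

library(dplyr)

# counts arrive as characters; convert before doing any arithmetic
res_df <- res_df %>%
  mutate(across(c(viewCount, likeCount, commentCount), as.numeric)) %>%
  arrange(desc(viewCount))

head(res_df)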

19.3.2 API

require(curl)
require(jsonlite)
library(kableExtra)
library(dplyr)
library(ggplot2)
library(plotly)
library(reshape2)

API_key = "YOUR API KEY"

getstats_video <- function(video_id, API_key){
  # query the videos endpoint for the snippet and statistics of a single video
  url <- paste0("https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics&id=", video_id, "&key=", API_key)
  result <- fromJSON(txt = url)
  return(data.frame(name = result$items$snippet$channelTitle,
                    result$items$statistics,
                    title = result$items$snippet$title,
                    date = result$items$snippet$publishedAt,
                    descrip = result$items$snippet$description))
}

get_playlist_canal <- function(id, API_key, topn = 15){
  # list the items (video ids) of a playlist
  url <- paste0('https://www.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=', id, '&key=', API_key, '&maxResults=', topn)
  result <- fromJSON(txt = url)
  return(data.frame(result$items$contentDetails))
}

getstats_canal <- function(id, API_key){
  # channel-level statistics plus the id of the channel's "uploads" playlist
  url <- paste0('https://www.googleapis.com/youtube/v3/channels?part=snippet%2CcontentDetails%2Cstatistics&id=', id, '&key=', API_key)
  result <- fromJSON(txt = url)
  return(data.frame(name = result$items$snippet$title,
                    result$items$statistics,
                    pl_list_id = result$items$contentDetails$relatedPlaylists))
}

getall_channels <- function(ids, API_key, topn = 5){
  # for each playlist id, list its videos and pull per-video statistics
  videos <- lapply(ids, FUN = get_playlist_canal, API_key = API_key, topn = topn) %>% bind_rows()

  stats <- lapply(videos[, 1], FUN = getstats_video, API_key = API_key)
  stats <- bind_rows(stats)
  stats$vid_id <- videos[, 1]
  return(stats)
}

Statistics per Channel

# comp_data is assumed to be a data frame, built earlier, whose cha_id column holds the channel ids of interest
can_st <- lapply(comp_data$cha_id, FUN = getstats_canal, API_key = API_key)
can_st <- bind_rows(can_st)
can_st$viewCount <- as.numeric(can_st$viewCount)
can_st[, 1:6] %>% kable() %>% kable_styling()
can_st$viewCount <- round(as.numeric(can_st$viewCount) / 1000000, 2)

p1 <- can_st %>%
  ggplot(aes(x = reorder(name, viewCount), y = viewCount, fill = name)) +
  geom_bar(stat = "sum") +
  guides(size = FALSE) +
  coord_flip() +
  scale_fill_manual(values = c("red", "darkblue", "yellow2")) +
  geom_text(inherit.aes = TRUE, aes(label = paste(viewCount, "M")), nudge_y = 0, angle = 90) +
  labs(x = "Total Visualizations (Millions)", y = "Visualizations", fill = "") +
  theme(legend.position = "top") +
  mytheme3  # mytheme3: a custom ggplot theme defined elsewhere

ggplotly(p1, tooltip = c("name", "viewCount")) %>%
  layout(legend = list(orientation = "h", x = 0.01, y = -0.1, autosize = FALSE))

Information on Individual Videos per Channel

var_to_see="dislikeCount" #favoriteCount or commentCount

info=getall_channels(ids = can_st$pl_list_id.uploads,API_key = API_key,topn =20)


 datacond=melt(info[,c(1:6,8)],id.vars = c("name","date"))
 datacond$date=as.Date(datacond$date)
 datacond$value=as.numeric(datacond$value)
    
    
ggplot(filter(datacond,variable==var_to_see),aes(x=as.Date(date),y=value,fill=name))+
geom_bar(stat="sum")+labs(x=var_to_see,y="",fill="")+guides(size=FALSE)+scale_fill_manual(values = c("red", "darkblue", "yellow2"))+theme(legend.position = "top")+
  scale_x_date(limits =as.Date(c(as.Date(min(datacond$date)),as.Date(Sys.time()))),date_breaks  ="month",date_labels="%b %y")+theme(legend.position = "top")+mytheme2 

19.3.3 Python

Pytube

# from pytube import YouTube
# YouTube('https://youtube.com/watch?v=2lAe1cqCOXo').streams.first().download()
import requests
import json

# note: gdata.youtube.com (the Data API v2) has been retired, so this request
# no longer returns usable data; the snippet is kept for illustration only
r = requests.get("http://gdata.youtube.com/feeds/api/standardfeeds/top_rated?v=2&alt=jsonc")
#r.text
data = json.loads(r.text)
data
# for item in data['data']['items']:
#     print("Video Title: %s" % item['title'])
#     print("Video Category: %s" % item['category'])
#     print("Video ID: %s" % item['id'])
#     print("Video Rating: %f" % item['rating'])
#     print("Embed URL: %s" % item['player']['default'])

Using BeautifulSoup

from requests_html import HTMLSession
from bs4 import BeautifulSoup as bs  # importing BeautifulSoup

# sample YouTube video url
video_url = "https://www.youtube.com/watch?v=jNQXAC9IVRw"
# init an HTML session
session = HTMLSession()
# get the html content
response = session.get(video_url)
# execute JavaScript so that dynamically rendered content is available
response.html.render(sleep=1)
# create a BeautifulSoup object to parse the HTML
soup = bs(response.html.html, "html.parser")
# write all HTML code into a file
with open("video.html", "w", encoding="utf8") as f:
    f.write(response.html.html)

Generate random video

import json
import urllib.request
import string
import random

count = 50
API_KEY = 'your_key'
# random three-character search string (named so it does not shadow the random module)
random_query = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(3))

urlData = "https://www.googleapis.com/youtube/v3/search?key={}&maxResults={}&part=snippet&type=video&q={}".format(API_KEY, count, random_query)
webURL = urllib.request.urlopen(urlData)
data = webURL.read()
encoding = webURL.info().get_content_charset('utf-8')
results = json.loads(data.decode(encoding))

for item in results['items']:
    videoId = item['id']['videoId']
    print(videoId)
    # store your ids
    #store your ids

A test of the same idea using the official YouTube API Python client

# -*- coding: utf-8 -*-

# Sample Python code for youtube.search.list
# See instructions for running these code samples locally:
# https://developers.google.com/explorer-help/guides/code_samples#python

import os

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]

def main():
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "YOUR_CLIENT_SECRET_FILE.json"

    # Get credentials and create an API client
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
        client_secrets_file, scopes)
    credentials = flow.run_console()
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

    request = youtube.search().list(
        part="snippet",
        maxResults=25,
        q="surfing"
    )
    response = request.execute()

    print(response)

if __name__ == "__main__":
    main()

Another way is a quick-and-dirty crawler; do not use it in production

import urllib.request
from random import randint
from time import sleep

def random_str(str_size):
    # build a random lowercase string of the given length
    res = ""
    for i in range(str_size):
        x = randint(0, 25)
        res += chr(ord('a') + x)
    return res

def find_watch(text, pos):
    # find the next "watch?v=..." fragment in the page source
    start = text.find("watch?v=", pos)
    if start < 0:
        return None, None
    end = text.find(" ", start)
    if end < 0:
        return None, None

    if end - start > 200:  # silly heuristic, probably not a must
        return None, None

    return text[start:end - 1], start


def find_instance_links():
    base_url = 'https://www.youtube.com/results?search_query='
    url = base_url + random_str(3)

    r = urllib.request.urlopen(url).read().decode('utf-8', errors='ignore')

    links = {}

    pos = 0
    while True:
        link, pos = find_watch(r, pos)
        if link is None or pos is None:
            break
        pos += 1
        if ";" in link:
            continue
        links[link] = 1

    items_list = list(links.items())

    list_size = len(items_list)
    selected = randint(list_size // 2, list_size - 1)
    return items_list[selected][0]


for i in range(1000):
    sleep(randint(7, 20))  # pause randomly between 7 and 20 seconds
    link = find_instance_links()
    print(link)

19.4 Consumer Expenditure

  • A consumer unit: “a single person, or group of persons who live together and who share responsibilities for most major expenditures.”

  • The “Reference Person” is the first person named when the respondent is asked, “Who is responsible for owning or renting this home?”

  • Two components of the Consumer Expenditure Surveys:

    • Interview survey

      • for “big ticket” and recurring expenditures, and global estimates on others
      • every three months for four quarters
      • rotating-panel survey
      • 6,000 consumer units each quarter
    • Diary survey

      • small-ticket, frequently purchased items

      • independent from the interview survey

      • about 14,000 diaries collected from families each year

  • Table format

    • annual

      • Jan to Dec: since 1984

      • July to June: since 2013

    • two-year (Jan year 1 - Dec year 2)

      • since 1986
  • Experimental research

    • all consumer units (available on request)

    • E.g., income tables, generational groups (e.g., Millennials, Gen X)

19.5 Gender, Age, Nationality

We can now make credible predictions about a person’s gender, age, and nationality based on his or her name.

The APIs are listed below, followed by a minimal sketch for querying them from R.

  • https://genderize.io/
  • https://agify.io/
  • https://nationalize.io/
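
Each service exposes a simple GET endpoint that takes a name parameter and returns JSON; the sketch below uses jsonlite and can be run without an API key for small request volumes (field names follow the public documentation on the sites above).

library(jsonlite)

name <- "michael"

# each API accepts a single `name` query parameter and returns a JSON document
gender <- fromJSON(paste0("https://api.genderize.io/?name=", name))
age    <- fromJSON(paste0("https://api.agify.io/?name=", name))
natl   <- fromJSON(paste0("https://api.nationalize.io/?name=", name))

gender$gender   # predicted gender (with probability and count fields)
age$age         # predicted age
natl$country    # data frame of country_id / probability pairs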

19.7 Baidu Index

See (P. J. Liu et al. 2019) on how to use a Java-based spider to collect search volume data.