The GDELT Project

A By-Station Visual Chronology Of The Top Visual Themes Of Television Evening News 2010-2019 Using Cloud Video + BigQuery

Using the new visual entity graph constructed by having the Cloud Video API watch a decade of evening news broadcasts on ABC, CBS and NBC in the Internet Archive's Television News Archive, what can we learn about the most significant visual themes of each evening? Using a single SQL query in BigQuery we can use a simple TFIDF calculation to surface the top 10 most meaningful visual labels of each evening's coverage on each station.

For example, on the evenings of November 19 and 20, 2019, panda imagery was featured at a much higher rate than normal, reflecting giant panda Bei Bei's departure from the US National Zoo to return to China.

The final chronologies can be downloaded below, one reflecting the combined imagery of the three stations and the other breaking the imagery down by station. In the by-station spreadsheet, KGO is the local ABC affiliate, KPIX is the local CBS affiliate and KNTV is the local NBC affiliate.

 

TECHNICAL DETAILS

Constructing the spreadsheet above took only a simple SQL query, modified from the original by Felipe Hoffa.

WITH nested AS (

WITH data AS (
SELECT TIMESTAMP_TRUNC(date, DAY) date, entities.name word, entities.numSeconds count FROM `gdelt-bq.gdeltv2.vgeg_iatv`, unnest(entities) entities
)
, word_day_type AS (
# how many times a word is mentioned in each "document"
SELECT word, SUM(count) counts, date
FROM data
GROUP BY 1, 3
)
, day_type AS (
# total # of words in each "document"
SELECT SUM(count) counts, date
FROM data
GROUP BY 2
)
, tf AS (
# TF for a word in a "document"
SELECT word, date, a.counts/b.counts tf
FROM word_day_type a
JOIN day_type b
USING(date)
)
, word_in_docs AS (
# how many "documents" have a word
SELECT word, COUNT(DISTINCT FORMAT('%s', FORMAT_TIMESTAMP( "%m/%d/%E4Y", date))) indocs
FROM word_day_type
GROUP BY 1
)
, total_docs AS (
# total # of docs
SELECT COUNT(DISTINCT FORMAT('%s', FORMAT_TIMESTAMP( "%m/%d/%E4Y", date))) total_docs
FROM data
)
, idf AS (
# IDF for a word
SELECT word, LOG(total_docs.total_docs/indocs) idf
FROM word_in_docs
CROSS JOIN total_docs
)

SELECT date,
ARRAY_AGG(STRUCT(ARRAY_TO_STRING(words, ', ') AS wordlist)) top_words
FROM (
SELECT date, ARRAY_AGG(word ORDER BY tfidf DESC LIMIT 10) words
FROM (
SELECT word, date, tf.tf * idf.idf tfidf
FROM tf
JOIN idf
USING(word)
)
GROUP BY date
)
GROUP BY date
ORDER BY date DESC

) select FORMAT_TIMESTAMP( "%m/%d/%E4Y", date) day,wordlist from nested, UNNEST(top_words) order by date asc

Breaking the chronology down by station required just adding the station field to the group by's.

WITH nested AS (

WITH data AS (
SELECT TIMESTAMP_TRUNC(date, DAY) date, station, entities.name word, entities.numSeconds count FROM `gdelt-bq.gdeltv2.vgeg_iatv`, unnest(entities) entities
)
, word_day_type AS (
# how many times a word is mentioned in each "document"
SELECT word, SUM(count) counts, date, station
FROM data
GROUP BY 1, 3, 4
)
, day_type AS (
# total # of words in each "document"
SELECT SUM(count) counts, date, station
FROM data
GROUP BY 2, 3
)
, tf AS (
# TF for a word in a "document"
SELECT word, date, station, a.counts/b.counts tf
FROM word_day_type a
JOIN day_type b
USING(date, station)
)
, word_in_docs AS (
# how many "documents" have a word
SELECT word, COUNT(DISTINCT FORMAT('%s %s', FORMAT_TIMESTAMP( "%m/%d/%E4Y", date), station)) indocs
FROM word_day_type
GROUP BY 1
)
, total_docs AS (
# total # of docs
SELECT COUNT(DISTINCT FORMAT('%s %s', FORMAT_TIMESTAMP( "%m/%d/%E4Y", date), station)) total_docs
FROM data
)
, idf AS (
# IDF for a word
SELECT word, LOG(total_docs.total_docs/indocs) idf
FROM word_in_docs
CROSS JOIN total_docs
)

SELECT date, station,
ARRAY_AGG(STRUCT(ARRAY_TO_STRING(words, ', ') AS wordlist)) top_words
FROM (
SELECT date, station, ARRAY_AGG(word ORDER BY tfidf DESC LIMIT 10) words
FROM (
SELECT word, date, station, tf.tf * idf.idf tfidf
FROM tf
JOIN idf
USING(word)
)
GROUP BY date, station
)
GROUP BY date, station
ORDER BY date DESC

) select FORMAT_TIMESTAMP( "%m/%d/%E4Y", date) day, station, wordlist from nested, UNNEST(top_words) order by date asc, station asc