# TikTok imports

In [1]:
import os
import time
from datetime import datetime
from getpass import getpass

import pandas as pd
import requests
from tqdm.auto import tqdm

In [2]:
!pip install ensembledata
from ensembledata.api import EDClient

Collecting ensembledata
  Downloading ensembledata-0.2.3-py3-none-any.whl.metadata (6.1 kB)
Downloading ensembledata-0.2.3-py3-none-any.whl (12 kB)
Installing collected packages: ensembledata
Successfully installed ensembledata-0.2.3


In [3]:
# initialize the ensembledata client
#
# The API token is a credential and must not be stored in the notebook.
# It is read from the ENSEMBLEDATA_API_TOKEN environment variable; when that
# is not set, the user is prompted for it (the input is not echoed).
# Any token that was ever saved in plain text in a shared copy of this
# notebook should be regenerated from the dashboard.
api_token = os.environ.get("ENSEMBLEDATA_API_TOKEN") or getpass("EnsembleData API token: ")
client = EDClient(api_token)

# get a free token at https://dashboard.ensembledata.com/register
# free trial of 50 units per day for 7 days, starting 23/11/24
#
# costs:
#  - 1 unit = 200 posts in full_hashtag_search :)
#  - 1 unit = 20 posts in hashtag_search
#  - 1 unit = 20 posts in keyword_search or full_keyword_search :(
#  - 1 unit = 30 comments to a post
#  - 1 unit = 30 replies to a comment
#
# documentation available at https://ensembledata.com/apis/docs#tag/Tiktok
# and https://github.com/EnsembleData/tiktok-scraper
# source code available at
# https://github.com/EnsembleData/ensembledata-python/tree/main/ensembledata/api

# Read posts from TikTok

Hashtag search (200 posts at the cost of 1 unit)

In [9]:
# fetch the posts published under a hashtag (single API call)
result = client.tiktok.full_hashtag_search(
    hashtag="rome",     # hashtag name, without the leading '#'
    days=360,           # drop posts created more than N days ago
    remap_output=True,  # False for mobile app, True for web
    max_cursor=200,     # max number of posts fetched,
                        # cost is 1 unit per 200 posts
)

In [14]:
# convert json to dataframe (nested keys become dotted column names)
df = pd.json_normalize(result.data["posts"])

# extract relevant info and give it short names; an explicit old -> new
# mapping keeps each rename tied to its source column instead of relying on
# two lists staying in the same order
hashtag_columns = {
    'itemInfos.id': 'id',
    'itemInfos.createTime': 'createTime',
    'itemInfos.authorId': 'authorId',
    'itemInfos.text': 'text',
    'itemInfos.video.urls': 'video.urls',
    'itemInfos.shareCount': 'shareCount',
    'itemInfos.playCount': 'playCount',
    'itemInfos.commentCount': 'commentCount',
}
df = df[list(hashtag_columns)].rename(columns=hashtag_columns)

# format time to date in string format
# (epoch seconds -> calendar date in the machine's local timezone)
df["createTime"] = df["createTime"].map(
    lambda ts: str(datetime.fromtimestamp(int(ts)).date()))

# save
# NOTE: the file name must match the one loaded again in the comments section
# ('rome_tiktok_hashtag_posts.csv', single underscore after 'rome').
df_hashtag = df
df_hashtag.to_csv('rome_tiktok_hashtag_posts.csv', index=False)

In [15]:
# render the hashtag posts table as this cell's output
display(df_hashtag)

Unnamed: 0,id,createTime,authorId,text,video.urls,shareCount,playCount,commentCount
0,7359267896134044960,2024-04-18,6993424758629123077,I am living in my dream ‚ù§Ô∏è‚Äçüî•‚ú®üòç #roma #rome #it...,[https://v77.tiktokcdn-eu.com/8d78f10eb75f4138...,63365,5288048,2053
1,7379659891151047968,2024-06-12,6993424758629123077,La capitale d'Italia üáÆüáπ #roma #rome #italy #it...,[https://v77.tiktokcdn-eu.com/ed963c38f39f80d7...,40073,8405111,6592
2,7435261141170785569,2024-11-09,15534388,#rome #romeitaly #trevifountain,[https://v77.tiktokcdn-eu.com/12bc6b2c6709a837...,10196,1831089,443
3,7352900587245014305,2024-04-01,54711960121925632,Rome üß° #fyp #tiktoktravel #rome #romeitaly #ro...,[https://v77.tiktokcdn-eu.com/4303f977c670ebf3...,217342,12185857,6623
4,7430530232064412961,2024-10-27,24722567,Pov: visiti Roma nel perido sbagliato #giubile...,[https://v77.tiktokcdn-eu.com/fd321985713cf518...,15788,2060491,999
...,...,...,...,...,...,...,...,...
111,7431923984209366304,2024-10-31,6640844595436371974,ü§∑üèæ‚Äç‚ôÄÔ∏è sorry someone had to say it! ü´£ Here are ...,[https://v45.tiktokcdn-eu.com/fb50dc732aca9d85...,205,32559,19
112,7438022027518283030,2024-11-16,7197067321264440325,just to see ancient Rome #rome#italy#ancientro...,[https://v45.tiktokcdn-eu.com/282d677d9e90fb33...,428,47569,23
113,7438302279083511072,2024-11-17,7412702609633854496,The vatican üáÆüáπ #vaticano #rome #italy #fyp,[https://v45.tiktokcdn-eu.com/d8c619f2698f76aa...,46,10228,1
114,7420842321752771847,2024-10-01,6875838408650081282,Would you wait in line for this mirror? üìç Chie...,[https://v45.tiktokcdn-eu.com/f3b1c1935cf936ea...,11868,966840,170


Keyword search (100 posts at the cost of 5 units) - limited to 180 days!

In [32]:
# search settings for the keyword query
keyword = "visit rome"  # keyword name
period = 180            # must be one of 0, 1, 7, 30, 90, 180 days;
                        # only posts more recent than this are returned
country = "us"          # leave blank "" for any country
max_cursor = 100        # max number of posts fetched, cost is 1 unit per 20 posts

# keyword_search is paginated: issue one request per 20 posts wanted and
# feed each response's cursor into the next request
posts = []
cursor = None
for i in tqdm(range(max_cursor // 20)):
    result = client.tiktok.keyword_search(
        keyword=keyword,
        period=period,
        country=country,
        cursor=cursor,
    )
    posts += result.data["data"]
    cursor = result.data.get("nextCursor")

    # a missing next cursor means every available post has been fetched
    if cursor is None:
        break

  0%|          | 0/5 [00:00<?, ?it/s]

In [33]:
# flatten the nested post records (nested keys become dotted column names)
df = pd.json_normalize(posts)

# keep the fields of interest and relabel them, in the same order, with the
# short names used for the hashtag posts table
df = df[[
    'aweme_info.aweme_id',
    'aweme_info.create_time',
    'aweme_info.author.uid',
    'aweme_info.desc',
    'aweme_info.video.play_addr.url_list',
    'aweme_info.statistics.share_count',
    'aweme_info.statistics.play_count',
    'aweme_info.statistics.comment_count',
]].set_axis(
    ['id', 'createTime', 'authorId', 'text', 'video.urls',
     'shareCount', 'playCount', 'commentCount'],
    axis="columns",
)

# epoch seconds -> date string (local timezone)
df["createTime"] = df["createTime"].map(
    lambda ts: str(datetime.fromtimestamp(int(ts)).date()))

# keep under a dedicated name and write to disk
df_keyword = df
df_keyword.to_csv('rome_tiktok_keyword_posts.csv', index=False)

In [35]:
# render the keyword posts table as this cell's output
display(df_keyword)

Unnamed: 0,id,createTime,authorId,text,video.urls,shareCount,playCount,commentCount
0,7440056847337377056,2024-11-22,6952787730524079109,What do you think? Comment below ü§îüëáüèª #rome #i...,[https://v58.tiktokcdn-eu.com/video/tos/useast...,7,2182,21
1,7374823519336664353,2024-05-30,6946260180691829762,üáÆüáπ 9 Best things to do in Rome‚Å£‚Å£ ‚¨áÔ∏è ‚Å£‚Å£ üõµ 1. Ve...,[https://v58.tiktokcdn-eu.com/video/tos/useast...,11818,1049530,161
2,7435983837500312840,2024-11-11,7343426956786992146,First time in Rome üî®üß±üî©üîßüößüèóÔ∏è most of the popular...,[https://v58.tiktokcdn-eu.com/video/tos/alisg/...,83,12121,16
3,7438717202817109280,2024-11-18,6732901876448936965,My first time in Rome and everything was under...,[https://v58.tiktokcdn-eu.com/video/tos/useast...,2,726,3
4,7440569693401877782,2024-11-23,7303282071206757408,help please ü•≤ #italy #rome #Rome #romeitaly #r...,[https://v58.tiktokcdn-eu.com/video/tos/no1a/t...,1,888,2
...,...,...,...,...,...,...,...,...
92,7440707400799210795,2024-11-24,7157473428814054443,Euro Summer 2024 #travel #greece #rome #paris ...,[https://v77.tiktokcdn-eu.com/82e7862373b3b2ec...,0,21,0
93,7439425407079796001,2024-11-20,6837200889528206342,Want to discover Rome with us? Visit us at the...,[https://v77.tiktokcdn-eu.com/a1c6710c6c6e13bf...,58,298532,86
94,7377874415004208430,2024-06-07,6722584075166614534,Had to carry our luggage up 5 flights of stair...,[https://v77.tiktokcdn-eu.com/a11e7694243209b1...,3291,527864,1084
95,7438926093886836000,2024-11-19,6809716996256629766,3 days in Rome üáÆüáπ Here‚Äôs our itinerary of the ...,[https://v77.tiktokcdn-eu.com/6dbe37bbe0ad4080...,2,673,2


# Read comments from TikTok

Comments search

In [43]:
# reload the saved posts, so this section can run without repeating the
# (paid) searches above
df_hashtag = pd.read_csv('rome_tiktok_hashtag_posts.csv')
df_keyword = pd.read_csv('rome_tiktok_keyword_posts.csv')

# posts below this number of comments are discarded; the same value drives
# both the filter and the printed summary so the two cannot disagree
min_comments = 30

# order by comment count
df_hashtag = df_hashtag.sort_values(by=['commentCount'], ascending=False).reset_index(drop=True)
df_keyword = df_keyword.sort_values(by=['commentCount'], ascending=False).reset_index(drop=True)
# delete posts with less than `min_comments` comments
df_hashtag = df_hashtag[df_hashtag['commentCount'] >= min_comments]
df_keyword = df_keyword[df_keyword['commentCount'] >= min_comments]
# state how many (the filter is inclusive, hence "at least")
print(f'{len(df_hashtag)} posts for hashtags with at least {min_comments} comments')
print(f'{len(df_keyword)} posts for keywords with at least {min_comments} comments')

69 posts for hastags with more than 30 comments
54 posts for keywords with more than 30 comments


In [49]:
# render the hashtag posts table after sorting and filtering by comment count
display(df_hashtag)

Unnamed: 0,id,createTime,authorId,text,video.urls,shareCount,playCount,commentCount
0,7349672610927529249,2024-03-23,6801498920364377093,Rome ü´∂üèºüáÆüáπ #rome #romeitaly #italytravel #euros...,['https://v77.tiktokcdn-eu.com/a95ac09c74d3913...,162339,9309494,7114
1,7352900587245014305,2024-04-01,54711960121925632,Rome üß° #fyp #tiktoktravel #rome #romeitaly #ro...,['https://v77.tiktokcdn-eu.com/4303f977c670ebf...,217342,12185857,6623
2,7379659891151047968,2024-06-12,6993424758629123077,La capitale d'Italia üáÆüáπ #roma #rome #italy #it...,['https://v77.tiktokcdn-eu.com/ed963c38f39f80d...,40073,8405111,6592
3,7318787567552318752,2023-12-31,7045849422799275013,STOP ROMANTICIZING ITALY. #italy #rome #italian,['https://v77.tiktokcdn-eu.com/108bc85c129140b...,10604,5092228,6362
4,7416287833528208673,2024-09-19,7090477710006322181,"L'**Altare della Patria**, situato nel cuore d...",['https://v77.tiktokcdn-eu.com/dce945b920b4772...,26466,4669982,4188
...,...,...,...,...,...,...,...,...
64,7388780788520684806,2024-07-07,6906593546226402310,Rome is still a beautiful place to visit! Don‚Äô...,['https://v15m.tiktokcdn-eu.com/457c28e11ba3a3...,755,124599,36
65,7421961779326045473,2024-10-04,7186645069767033861,"October in Rome, Italy üáÆüáπüçÇüéÉ Are you visiting ...",['https://v77.tiktokcdn-eu.com/93618265a1f48a8...,477,84173,35
66,7402265058476166432,2024-08-12,7166665197695943686,No wonder Rome is called the Eternal Cityüï∞Ô∏è ...,['https://v77.tiktokcdn-eu.com/27d70eba3a61168...,872,158881,34
67,7309915830089682208,2023-12-07,7054875590512739334,How cold is it in Rome in December? ü•∂ #carpedi...,['https://v15m.tiktokcdn-eu.com/d671732bafaca8...,952,172359,33


In [45]:
# set search criteria
comments_to_retrieve = 60 # 30 is a safe number = 1 credit!
posts_to_scan = 2 # number of top posts to read comments from; raise to get more!

comments = list()
# .head() takes the first rows by position and simply yields fewer items when
# the table is shorter than `posts_to_scan`, instead of raising a KeyError
for aweme_id in df_hashtag["id"].head(posts_to_scan):
    cursor = None
    # one request per block of 30 comments, following the returned cursor
    for _ in range(comments_to_retrieve // 30):
        result = client.tiktok.post_comments(
            aweme_id = aweme_id,
            cursor = cursor
        )
        comments.extend(result.data["comments"])
        cursor = result.data.get("nextCursor")
        # no next cursor: this post has no further comments to page through
        if cursor is None:
            break

In [46]:
# flatten the raw comment records into a table
df = pd.json_normalize(comments)

# keep the post id, comment id, timestamp, text and reply counter
df = df.loc[:, ['aweme_id', 'cid', 'create_time', 'text', 'reply_comment_total']]

# epoch seconds -> date string (local timezone)
df["create_time"] = df["create_time"].map(
    lambda ts: str(datetime.fromtimestamp(int(ts)).date()))

# most-replied comments first, renumbering the rows from 0
df = df.sort_values(by='reply_comment_total', ascending=False, ignore_index=True)

# keep under a dedicated name and write to disk
df_comments = df
df_comments.to_csv('rome_tiktok_hashtag_comments.csv', index=False)

In [48]:
# render the comments table as this cell's output
display(df_comments)

Unnamed: 0,aweme_id,cid,create_time,text,reply_comment_total
0,7349672610927529249,7353231645930111776,2024-04-02,I went 4 times and have hated it all times,50
1,7352900587245014305,7353286350575469345,2024-04-02,Rome is really the city of love ‚Ä¶ and I live i...,23
2,7352900587245014305,7353109112106238752,2024-04-02,Rome is like a museum without roof üòç,18
3,7349672610927529249,7353574920545288993,2024-04-03,I want to experience Rome as a tourist (i live...,17
4,7349672610927529249,7353322352967271200,2024-04-02,Quelle p√©riode est le mieux pour y aller ? Mercii,13
...,...,...,...,...,...
104,7349672610927529249,7372811125262222086,2024-05-25,Rome is beautiful,0
105,7349672610927529249,7351428475365016352,2024-03-28,my favourite city‚ú®,0
106,7349672610927529249,7380245236709360390,2024-06-14,I was there two days ago. It's just an incredi...,0
107,7349672610927529249,7376726674639979296,2024-06-04,Rome is gorgüòç,0


Replies search

In [50]:
# set search criteria
replies_to_retrieve = 30 # 30 is a safe number = 1 credit!
comments_to_scan = 4 # number of top comments to read replies from; raise to get more!

replies = list()
# .head() takes the first rows by position and simply yields fewer rows when
# the table is shorter than `comments_to_scan`, instead of raising a KeyError
top_comments = df_comments.head(comments_to_scan)
for aweme_id, cid in zip(top_comments["aweme_id"], top_comments["cid"]):
    cursor = None
    # one request per block of 30 replies, following the returned cursor
    for _ in range(replies_to_retrieve // 30):
        result = client.tiktok.post_comment_replies(
            aweme_id = aweme_id,
            comment_id = cid,
            cursor = cursor
        )
        replies.extend(result.data["comments"])
        cursor = result.data.get("nextCursor")
        # no next cursor: this comment has no further replies to page through
        if cursor is None:
            break

In [51]:
# flatten the raw reply records into a table
df = pd.json_normalize(replies)

# keep the post id, reply id, parent comment id, timestamp and text
df = df.loc[:, ['aweme_id', 'cid', 'reply_id', 'create_time', 'text']]

# epoch seconds -> date string (local timezone)
df["create_time"] = df["create_time"].map(
    lambda ts: str(datetime.fromtimestamp(int(ts)).date()))

# keep under a dedicated name and write to disk
df_replies = df
df_replies.to_csv('rome_tiktok_hashtag_replies.csv', index=False)

In [52]:
# render the replies table as this cell's output
display(df_replies)

Unnamed: 0,aweme_id,cid,reply_id,create_time,text
0,7349672610927529249,7353333101487358752,7353231645930111776,2024-04-02,ce ne faremo una ragione...‚ù§Ô∏èüáÆüáπ
1,7349672610927529249,7353333331507659553,7353231645930111776,2024-04-02,*well*
2,7349672610927529249,7355115073627407137,7353231645930111776,2024-04-07,Voglio andare a Roma
3,7349672610927529249,7363162373286904608,7353231645930111776,2024-04-29,SAMEEEE
4,7349672610927529249,7368083006227071751,7353231645930111776,2024-05-12,Io voglio andare a Roma but Io non parlo bene ...
...,...,...,...,...,...
82,7349672610927529249,7360266512425190176,7353574920545288993,2024-04-21,.
83,7349672610927529249,7358219639027581703,7353574920545288993,2024-04-15,üò¢
84,7349672610927529249,7357403833289589536,7353574920545288993,2024-04-13,literally
85,7349672610927529249,7354393013818655531,7353574920545288993,2024-04-05,no cuz I wish I could revisit rome as a touris...
