{"cells":[{"cell_type":"code","source":["import pandas as pd\n","import numpy as np\n","import re\n","from tqdm.auto import tqdm\n","import seaborn as sns\n","import pickle\n","import time\n","import matplotlib\n","import matplotlib.pyplot as plt\n","from scipy.sparse import csr_matrix\n","import scipy.sparse as sps"],"metadata":{"id":"4JjxeQqQhNM9","executionInfo":{"status":"ok","timestamp":1765743611855,"user_tz":-60,"elapsed":3843,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":1,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"dVAa-v0bYujf"},"source":["# Load stuff previously defined"]},{"cell_type":"code","source":["import spacy\n","!pip install --quiet spacymoji\n","from spacymoji import Emoji\n","nlp = spacy.load('en_core_web_sm')\n","nlp.tokenizer.token_match = re.compile(\"^#\\w+$\").match\n","nlp.add_pipe(\"emoji\", first=True)\n","\n","import emoji\n","def get_emoji_regexp():\n"," # Sort emoji by length to make sure multi-character emojis are\n"," # matched first\n"," emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)\n"," pattern = '(' + '|'.join(re.escape(u) for u in emojis) + ')'\n"," return re.compile(pattern)"],"metadata":{"id":"pwL9KcrOTLkd","colab":{"base_uri":"https://localhost:8080/"},"outputId":"575d6090-6d0d-433e-ab45-5546dbd4dda4","executionInfo":{"status":"ok","timestamp":1765743634911,"user_tz":-60,"elapsed":23048,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":2,"outputs":[{"output_type":"stream","name":"stderr","text":["<>:5: SyntaxWarning: invalid escape sequence '\\w'\n","<>:5: SyntaxWarning: invalid escape sequence '\\w'\n","/tmp/ipython-input-4268926626.py:5: SyntaxWarning: invalid escape sequence '\\w'\n"," nlp.tokenizer.token_match = re.compile(\"^#\\w+$\").match\n"]},{"output_type":"stream","name":"stdout","text":["\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/608.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m604.2/608.4 kB\u001b[0m \u001b[31m22.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m608.4/608.4 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h"]}]},{"cell_type":"code","source":["!pip install --quiet igraph\n","import igraph as ig"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ZGtTXttDlSXt","executionInfo":{"status":"ok","timestamp":1765744659249,"user_tz":-60,"elapsed":7193,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}},"outputId":"2d45daeb-2c7d-4236-b6ea-fdd55bf4c101"},"execution_count":28,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/5.7 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.5/5.7 MB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.3/5.7 MB\u001b[0m \u001b[31m45.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m5.7/5.7 MB\u001b[0m \u001b[31m60.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.7/5.7 MB\u001b[0m \u001b[31m41.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h"]}]},{"cell_type":"code","source":["class SemanticNetwork:\n"," def __init__(self,nlp):\n"," self.nlp=nlp\n","\n"," def _clean_text_fun(self, text, to_keep=True,\n"," kr_list = [\"ADJ\",\"ADV\",\"EMOJI\",\"HASH\",\"NOUN\",\"PROPN\",\"VERB\"]):\n","\n"," # apply spacy\n"," doc = self.nlp(\" \".join(get_emoji_regexp().split(text)))\n"," # collect spacy outomes in in a list\n"," out = list()\n"," for token in doc:\n"," out.append((token.text, token.text+token.whitespace_,\n"," token.lemma_.lower(), token.pos_,\n"," token.text.startswith(\"#\"),\n"," token.text.startswith(\"@\"),\n"," emoji.purely_emoji(token.text),\n"," (token.text.startswith(\"https:\") or token.text.startswith(\"http:\"))\n"," ))\n"," # turn it into a dataframe\n"," df2 = pd.DataFrame(out)\n"," df2.columns = ['text', 'full_text', 'lemma', 'PoS',\n"," 'is_hashtag', 'is_mention', 'is_emoji', 'is_link']\n"," # build a PoS column that identifies hashtags, mentions, etc\n"," df2.insert(loc = 4, column = 'myPoS', value = df2['PoS'])\n"," df2.loc[df2['is_hashtag']==True,'myPoS'] = 'HASH'\n"," df2.loc[df2['is_mention']==True,'myPoS'] = 'MENT'\n"," df2.loc[df2['is_emoji']==True,'myPoS'] = 'EMOJI'\n"," df2.loc[df2['is_link']==True,'myPoS'] = 'HTML'\n"," # replace emojis lemmas with their description\n"," for tmp in doc._.emoji:\n"," df2.loc[tmp[1],'lemma'] = tmp[0]+\" \"+tmp[2]\n"," # keep/remove only what is asked\n"," if to_keep:\n"," out = [j for (i,j) in zip(df2['myPoS'],df2['lemma']) if i in kr_list]\n"," else:\n"," out = [j for (i,j) in zip(df2['myPoS'],df2['lemma']) if i not in kr_list]\n"," return out\n","\n"," # with to_keep you can choose if you want to keep or remove certain PoS\n"," #\n"," # with kr_list you can choose which POS tags to keep or remove,\n"," # to be chosen from the following list\n"," #\n"," # Universal POS Tags http://universaldependencies.org/u/pos/\n"," #\n"," # \"ADJ\": \"adjective\",\n"," # \"ADP\": \"adposition\",\n"," # \"ADV\": \"adverb\",\n"," # \"AUX\": \"auxiliary\",\n"," # \"CONJ\": \"conjunction\",\n"," # \"CCONJ\": \"coordinating conjunction\",\n"," # \"DET\": \"determiner\",\n"," # \"INTJ\": \"interjection\",\n"," # \"NOUN\": \"noun\",\n"," # \"NUM\": \"numeral\",\n"," # \"PART\": \"particle\",\n"," # \"PRON\": \"pronoun\",\n"," # \"PROPN\": \"proper noun\",\n"," # \"PUNCT\": \"punctuation\",\n"," # \"SCONJ\": \"subordinating conjunction\",\n"," # \"SYM\": \"symbol\",\n"," # \"VERB\": \"verb\",\n"," # \"X\": \"other\",\n"," # \"EOL\": \"end of line\",\n"," # \"SPACE\": \"space\"\n"," #\n"," # Internal tags\n"," #\n"," # \"EMOJI\" emojis,\n"," # \"HASH\" hastags\n"," # \"HTML\" web links,\n"," # \"MENT\" mentions\n","\n"," def clean_text(self, df, to_keep=True,\n"," kr_list = [\"ADJ\",\"ADV\",\"EMOJI\",\"HASH\",\"NOUN\",\"PROPN\",\"VERB\"]):\n","\n"," self.df = df.copy()\n"," for i in tqdm(range(len(df))):\n"," text = df.loc[i,'translated']\n"," self.df.loc[[i],'clean list'] = pd.Series([\n"," self._clean_text_fun(text, to_keep, kr_list)\n"," ], index=[i])\n","\n"," # extracts occurrence matrix Nwd, also returns the\n"," # documents actually in use and the words dictionary\n"," #\n"," # words occurring less than n_min times are discarded\n"," # words occurring more than n_max times are discarded\n"," #\n"," # documents with zero active words are discarded\n","\n"," def get_Nwd(self, n_min=2, n_max=1e10):\n","\n"," # capture execution time\n"," tic = time.time()\n","\n"," # collection of (unique) words\n"," clean_texts_list = list(self.df['clean list'])\n"," words = np.unique([item for sublist in clean_texts_list \\\n"," for item in sublist])\n"," Nw = len(words) # number of words (so far)\n"," # documents list\n"," Nd = len(clean_texts_list) # number of documents (so far)\n"," documents = np.array(range(Nd))\n","\n"," # occurrence matrix for words in documents\n"," words_dict = dict(zip(words,range(Nw))) # words dictionary\n"," Nwd = csr_matrix((Nw, Nd), dtype = np.int8).toarray()\n"," for i in range(Nd):\n"," for j in clean_texts_list[i]:\n"," Nwd[words_dict[j],i] += 1\n","\n"," # identify words used less than n_min or more than n_max\n"," select = ((np.sum(Nwd,axis=1) >= n_min) & \\\n"," (np.sum(Nwd,axis=1) < n_max))\n"," # explicitly print the most frequent ones\n"," print('removing words...')\n"," with np.printoptions(threshold=np.inf):\n"," print(words[(np.sum(Nwd,axis=1)>=n_max)])\n"," # remove them\n"," Nwd = Nwd[select,:]\n"," words = words[select]\n"," # remove documents that do not contain words\n"," select = (np.sum(Nwd,axis=0)>0)\n"," Nwd = csr_matrix(Nwd[:,select])\n"," documents = documents[select]\n","\n"," # capture execution time\n"," print(f'Occurrence matrix: execution time {time.time()-tic} [s]')\n","\n"," self.Nwd = Nwd\n"," self.words = words\n"," self.documents = documents\n","\n"," # plot words occurrences\n"," plt.figure(figsize=(4, 3))\n"," plt.semilogy(-np.sort(-np.asarray(np.sum(Nwd,axis=1)).reshape(-1)))\n"," plt.grid(True)\n"," plt.xlabel('word id')\n"," plt.ylabel('# of occurrences')\n"," plt.title(\"words occurrencies\");\n"," plt.show()\n","\n"," # build other matrices\n"," # equally likely documents case!\n"," Pwd = Nwd/Nwd.sum(axis=0).flatten()/Nwd.shape[1]\n"," # words and document matrices\n"," pd = Pwd.sum(axis=0).flatten()\n"," Pww = (Pwd/pd).dot(Pwd.T)\n"," pw = Pwd.sum(axis=1).flatten()\n"," Pdd = (Pwd.T/pw).dot(Pwd)\n"," self.Pwd = Pwd\n"," self.Pdd = Pdd\n"," self.pd = pd"],"metadata":{"id":"93Roe9iggNDG","executionInfo":{"status":"ok","timestamp":1765743674077,"user_tz":-60,"elapsed":103,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["def logg(x):\n"," y = np.log(x)\n"," y[x==0] = 0\n"," return y\n","\n","# NMI\n","def nmi_fn(A): # A = Pwc\n"," aw = A.sum(axis=1).flatten() # word probability\n"," ac = A.sum(axis=0).flatten() # class probability\n"," Hc = np.multiply(ac,-logg(ac)).sum() # class entropy\n"," A2 = ((A/ac).T/aw).T\n"," A2.data = logg(A2.data)\n"," y = (A.multiply(A2)).sum()/Hc\n"," return y\n","\n","# modularity\n","def modularity_fn(A):\n"," y = A.trace()-(A.sum(axis=0)*A.sum(axis=1)).item()\n"," return y\n","\n","# Ncut\n","def ncut_fn(A):\n"," y = ((A.sum(axis=0)-A.diagonal())/A.sum(axis=0)).mean()\n"," return y\n","\n","# Infomap - 1\n","def pagerank_fn(M,q,c=.85,it=60):\n"," r = q.copy() # ranking matrix, initialized to q (copy)\n"," for k in range(it): # slow cycle\n"," r = c*M.dot(r) + (1-c)*q\n"," return r\n","\n","# Infomap - 2\n","def _infomap_fn(v):\n"," y = -(v.data*logg(v.data/v.sum())).sum()\n"," return y\n","\n","# Infomap - 3\n","def infomap_rank_fn(Pdd):\n"," # transition matrix\n"," pd = Pdd.sum(axis=0).flatten()\n"," M = sps.csr_matrix(Pdd/pd)\n"," # pagerank vector - faster than r = pagerank_fn(M,q)\n"," G = ig.Graph.Adjacency((M > 0).toarray().tolist())\n"," G.es['weight'] = np.array(M[M.nonzero()])[0]\n"," r = G.pagerank(weights='weight')\n"," r = (sps.csr_matrix(np.array(r))).T\n"," return r\n","\n","# Infomap - 4\n","def infomap_fn(C,Pdd,r):\n"," pd = Pdd.sum(axis=0).flatten()\n"," M = Pdd/pd # transition matrix\n"," # extract vectors\n"," z = (C.T).dot(sps.diags(r.toarray().flatten()))\n"," q = sps.csr_matrix((1,z.shape[0]))\n"," c = .85\n"," for i in range(z.shape[0]):\n"," tmp = ((C[:,i].transpose()).dot(M)).dot(z[i].transpose())\n"," q[0,i] = (1-(1-c)*C[:,i].sum()/M.shape[0])*z[i].sum()-c*tmp[0,0]\n"," # extract statistics\n"," y = _infomap_fn(q)\n"," for i in range(z.shape[0]):\n"," y += _infomap_fn(sps.hstack([z[i],sps.csr_matrix([[q[0,i]]])]))\n"," # normalize\n"," y = (y/_infomap_fn(pd))-1\n"," return y"],"metadata":{"id":"r1QkkJSSngt4","executionInfo":{"status":"ok","timestamp":1765743698089,"user_tz":-60,"elapsed":45,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":4,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"ERt8qIRCKo6M"},"source":["# Load data and run BERTopic"]},{"cell_type":"code","source":["# load data\n","df = pd.read_excel(\"/content/tweets_greta_translated.xlsx\")\n","df.drop_duplicates(subset=[\"text\"],inplace=True)\n","df.reset_index(drop=True, inplace=True)"],"metadata":{"id":"5RDfMXIohuUv","executionInfo":{"status":"ok","timestamp":1765743756003,"user_tz":-60,"elapsed":1480,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":6,"outputs":[]},{"cell_type":"code","source":["!pip install --quiet bertopic\n","from bertopic import BERTopic"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"THHR7ie0C_f1","outputId":"1889666f-4b1f-4615-a16b-a66bab237d52","executionInfo":{"status":"ok","timestamp":1765743820979,"user_tz":-60,"elapsed":51816,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":8,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/hdbscan/robust_single_linkage_.py:175: SyntaxWarning: invalid escape sequence '\\{'\n"," $max \\{ core_k(a), core_k(b), 1/\\alpha d(a,b) \\}$.\n"]}]},{"cell_type":"code","execution_count":31,"metadata":{"id":"6RRz8RjDIfCn","colab":{"base_uri":"https://localhost:8080/","height":1000},"outputId":"e056676e-aff7-480c-dce3-17dffaf1dee1","executionInfo":{"status":"ok","timestamp":1765744852453,"user_tz":-60,"elapsed":26884,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Topic Count Name \\\n","0 -1 1330 -1_the_to_climatechange_of \n","1 0 684 0_climatechange_to_the_climate \n","2 1 314 1_sustainability_plastic_environment_recycling \n","3 2 167 2_sdgs_the_development_to \n","4 3 125 3_food_agriculture_farming_farmers \n","5 4 94 4_ice_arctic_sea_glaciers \n","6 5 80 5_africa_sdgs_in_nigeria \n","7 6 69 6_solar_energy_implement_panels \n","8 7 60 7_trees_forests_planting_forest \n","9 8 60 8_oil_carbon_emissions_spend \n","10 9 47 9_cdnpoli_canadians_canada_trudeau \n","11 10 46 10_electric_cars_norway_ev \n","12 11 43 11_water_worldwaterday_the_environment \n","13 12 41 12_sdgs_women_womensday_gender \n","14 13 40 13_power_renewables_wind_electricity \n","15 14 35 14_auspol_australia_australian_government \n","16 15 34 15_cyclone_mozambique_cycloneidai_idai \n","17 16 24 16_insects_insect_numbers_crashing \n","18 17 22 17_women_gender_internationalwomensday_to \n","19 18 21 18_ai_iot_robotics_automation \n","20 19 21 19_wildlife_species_environment_animals \n","21 20 21 20_temperature_record_warmest_records \n","22 21 19 21_airpollution_air_pollution_tb \n","23 22 19 22_mumbai_indian_india_28 \n","24 23 18 23_reef_coral_barrier_bleaching \n","25 24 17 24_eu_2050_vote_for \n","26 25 17 25_canada_warming_twice_report \n","27 26 17 26_deal_green_greennewdeal_new \n","28 27 17 27_methane_emissions_fossil_fuels \n","29 28 16 28_fish_lakes_fisheries_oceans \n","30 29 15 29_demswork4usa_progressives_healthcare_medicaid \n","31 30 15 30_attenborough_david_sir_documentary \n","32 31 15 31_cities_city_climate_httpstcotnuyoisan0 \n","33 32 13 32_secret_garden_tampa_climate \n","34 33 13 33_ocean_marine_oceans_whales \n","35 34 12 34_fracking_judge_drilling_wyoming \n","36 35 12 35_kashthefuturist_cberrl_amazingchevvolt_gezg... \n","37 36 12 36_nuclear_energy_httpstcowzwsvj71mq_nukes \n","38 37 11 37_health_immune_population_single \n","39 38 11 38_co2_atmosphere_were_was \n","\n"," Representation \\\n","0 [the, to, climatechange, of, and, is, in, on, ... \n","1 [climatechange, to, the, climate, and, in, is,... \n","2 [sustainability, plastic, environment, recycli... \n","3 [sdgs, the, development, to, on, goals, amp, i... \n","4 [food, agriculture, farming, farmers, to, soil... \n","5 [ice, arctic, sea, glaciers, melting, in, the,... \n","6 [africa, sdgs, in, nigeria, for, the, leavenoo... \n","7 [solar, energy, implement, panels, lets, solut... \n","8 [trees, forests, planting, forest, to, defores... \n","9 [oil, carbon, emissions, spend, and, its, lobb... \n","10 [cdnpoli, canadians, canada, trudeau, the, pri... \n","11 [electric, cars, norway, ev, crisis, air, impl... \n","12 [water, worldwaterday, the, environment, waste... \n","13 [sdgs, women, womensday, gender, iwd2019, equa... \n","14 [power, renewables, wind, electricity, renewab... \n","15 [auspol, australia, australian, government, ad... \n","16 [cyclone, mozambique, cycloneidai, idai, zimba... \n","17 [insects, insect, numbers, crashing, pesticide... \n","18 [women, gender, internationalwomensday, to, in... \n","19 [ai, iot, robotics, automation, smart, cities,... \n","20 [wildlife, species, environment, animals, tige... \n","21 [temperature, record, warmest, records, februa... \n","22 [airpollution, air, pollution, tb, asthma, smo... \n","23 [mumbai, indian, india, 28, globalwarming, nee... \n","24 [reef, coral, barrier, bleaching, reefs, great... \n","25 [eu, 2050, vote, for, european, climate, brexi... \n","26 [canada, warming, twice, report, rate, leaked,... \n","27 [deal, green, greennewdeal, new, hill, the, ca... \n","28 [methane, emissions, fossil, fuels, fracking, ... \n","29 [fish, lakes, fisheries, oceans, eutrophicatio... \n","30 [demswork4usa, progressives, healthcare, medic... \n","31 [attenborough, david, sir, documentary, bbc, p... \n","32 [cities, city, climate, httpstcotnuyoisan0, ve... \n","33 [secret, garden, tampa, climate, fighting, bac... \n","34 [ocean, marine, oceans, whales, lifeunderwater... \n","35 [fracking, judge, drilling, wyoming, lands, bl... \n","36 [kashthefuturist, cberrl, amazingchevvolt, gez... \n","37 [nuclear, energy, httpstcowzwsvj71mq, nukes, p... \n","38 [health, immune, population, single, threats, ... \n","39 [co2, atmosphere, were, was, 2016, usa, high, ... \n","\n"," Representative_Docs \n","0 [\"This is not about #climatechange anymore, th... \n","1 [It's elections time. Time for a new party: on... \n","2 [Did you, like me, think tins, glass and paper... \n","3 [The Sustainable\\nDevelopment Goals (#SDGs):\\n... \n","4 [Innovation is the central driving force which... \n","5 [ARCTIC SEA-ICE EXPLOSION : Largest Increase I... \n","6 [\"Africa bears the brunt of #ClimateChange and... \n","7 [Chinese scientists have invented #solar panel... \n","8 [Pakistan is planting 10 billion trees. To fig... \n","9 [According to a new report, the five largest s... \n","10 [#BigOil knew their products would cause #cli... \n","11 [Norway is banning cars from the centre of Osl... \n","12 [Just started a course this week on Water and ... \n","13 [I am very happy to share my experience with s... \n","14 [Wow. #Denmark is using the power of the ocean... \n","15 [This week Scott Morrison doubled down on Tony... \n","16 [At Least 150 Dead, 1.5 Million Impacted as Cy... \n","17 [Insect numbers are crashing in huge numbers d... \n","18 [@NamugerwaLeah @GretaThunberg @BBCAfrica @Ext... \n","19 [“Imagine creating a world that is driven by e... \n","20 [#planet #sustainable #sustainability \\n#susta... \n","21 [Animation showing the evolution of global mea... \n","22 [Air pollution is now more deadly than war, sm... \n","23 [Can someone urgently verify this before I fai... \n","24 [The Great Barrier Reef is being battered by #... \n","25 [Yesss! European youth is rising for the clima... \n","26 [Climate change is warming Canada at a rate tw... \n","27 [PRESS CONFERENCE: Tomorrow, I'll be unveiling... \n","28 [“The science is crystal clear, we need to pha... \n","29 [RT @SOCCOMProject: Oceans’ fever means fewer ... \n","30 [🔁#Florida CD26 #Election2020-Nov-3.\\n\\nSuppor... \n","31 [Sir David Attenborough to present climate cha... \n","32 [Venice\\n\\nSo full of history, art, and poetry... \n","33 [Northwestern tribes and the University of Was... \n","34 [30×30: groundbreaking scientific study maps o... \n","35 [It has now become clear, with guidance from t... \n","36 [@Jackthelad1947 @NeurozoInnovat1 @ristori20 @... \n","37 [Given that a new nuclear power plant getting ... \n","38 [#Climatechange has been identified as the big... \n","39 [@realDonaldTrump @foxandfriends Earth's abili... "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
TopicCountNameRepresentationRepresentative_Docs
0-11330-1_the_to_climatechange_of[the, to, climatechange, of, and, is, in, on, ...[\"This is not about #climatechange anymore, th...
106840_climatechange_to_the_climate[climatechange, to, the, climate, and, in, is,...[It's elections time. Time for a new party: on...
213141_sustainability_plastic_environment_recycling[sustainability, plastic, environment, recycli...[Did you, like me, think tins, glass and paper...
321672_sdgs_the_development_to[sdgs, the, development, to, on, goals, amp, i...[The Sustainable\\nDevelopment Goals (#SDGs):\\n...
431253_food_agriculture_farming_farmers[food, agriculture, farming, farmers, to, soil...[Innovation is the central driving force which...
54944_ice_arctic_sea_glaciers[ice, arctic, sea, glaciers, melting, in, the,...[ARCTIC SEA-ICE EXPLOSION : Largest Increase I...
65805_africa_sdgs_in_nigeria[africa, sdgs, in, nigeria, for, the, leavenoo...[\"Africa bears the brunt of #ClimateChange and...
76696_solar_energy_implement_panels[solar, energy, implement, panels, lets, solut...[Chinese scientists have invented #solar panel...
87607_trees_forests_planting_forest[trees, forests, planting, forest, to, defores...[Pakistan is planting 10 billion trees. To fig...
98608_oil_carbon_emissions_spend[oil, carbon, emissions, spend, and, its, lobb...[According to a new report, the five largest s...
109479_cdnpoli_canadians_canada_trudeau[cdnpoli, canadians, canada, trudeau, the, pri...[#BigOil knew their products would cause #cli...
11104610_electric_cars_norway_ev[electric, cars, norway, ev, crisis, air, impl...[Norway is banning cars from the centre of Osl...
12114311_water_worldwaterday_the_environment[water, worldwaterday, the, environment, waste...[Just started a course this week on Water and ...
13124112_sdgs_women_womensday_gender[sdgs, women, womensday, gender, iwd2019, equa...[I am very happy to share my experience with s...
14134013_power_renewables_wind_electricity[power, renewables, wind, electricity, renewab...[Wow. #Denmark is using the power of the ocean...
15143514_auspol_australia_australian_government[auspol, australia, australian, government, ad...[This week Scott Morrison doubled down on Tony...
16153415_cyclone_mozambique_cycloneidai_idai[cyclone, mozambique, cycloneidai, idai, zimba...[At Least 150 Dead, 1.5 Million Impacted as Cy...
17162416_insects_insect_numbers_crashing[insects, insect, numbers, crashing, pesticide...[Insect numbers are crashing in huge numbers d...
18172217_women_gender_internationalwomensday_to[women, gender, internationalwomensday, to, in...[@NamugerwaLeah @GretaThunberg @BBCAfrica @Ext...
19182118_ai_iot_robotics_automation[ai, iot, robotics, automation, smart, cities,...[“Imagine creating a world that is driven by e...
20192119_wildlife_species_environment_animals[wildlife, species, environment, animals, tige...[#planet #sustainable #sustainability \\n#susta...
21202120_temperature_record_warmest_records[temperature, record, warmest, records, februa...[Animation showing the evolution of global mea...
22211921_airpollution_air_pollution_tb[airpollution, air, pollution, tb, asthma, smo...[Air pollution is now more deadly than war, sm...
23221922_mumbai_indian_india_28[mumbai, indian, india, 28, globalwarming, nee...[Can someone urgently verify this before I fai...
24231823_reef_coral_barrier_bleaching[reef, coral, barrier, bleaching, reefs, great...[The Great Barrier Reef is being battered by #...
25241724_eu_2050_vote_for[eu, 2050, vote, for, european, climate, brexi...[Yesss! European youth is rising for the clima...
26251725_canada_warming_twice_report[canada, warming, twice, report, rate, leaked,...[Climate change is warming Canada at a rate tw...
27261726_deal_green_greennewdeal_new[deal, green, greennewdeal, new, hill, the, ca...[PRESS CONFERENCE: Tomorrow, I'll be unveiling...
28271727_methane_emissions_fossil_fuels[methane, emissions, fossil, fuels, fracking, ...[“The science is crystal clear, we need to pha...
29281628_fish_lakes_fisheries_oceans[fish, lakes, fisheries, oceans, eutrophicatio...[RT @SOCCOMProject: Oceans’ fever means fewer ...
30291529_demswork4usa_progressives_healthcare_medicaid[demswork4usa, progressives, healthcare, medic...[🔁#Florida CD26 #Election2020-Nov-3.\\n\\nSuppor...
31301530_attenborough_david_sir_documentary[attenborough, david, sir, documentary, bbc, p...[Sir David Attenborough to present climate cha...
32311531_cities_city_climate_httpstcotnuyoisan0[cities, city, climate, httpstcotnuyoisan0, ve...[Venice\\n\\nSo full of history, art, and poetry...
33321332_secret_garden_tampa_climate[secret, garden, tampa, climate, fighting, bac...[Northwestern tribes and the University of Was...
34331333_ocean_marine_oceans_whales[ocean, marine, oceans, whales, lifeunderwater...[30×30: groundbreaking scientific study maps o...
35341234_fracking_judge_drilling_wyoming[fracking, judge, drilling, wyoming, lands, bl...[It has now become clear, with guidance from t...
36351235_kashthefuturist_cberrl_amazingchevvolt_gezg...[kashthefuturist, cberrl, amazingchevvolt, gez...[@Jackthelad1947 @NeurozoInnovat1 @ristori20 @...
37361236_nuclear_energy_httpstcowzwsvj71mq_nukes[nuclear, energy, httpstcowzwsvj71mq, nukes, p...[Given that a new nuclear power plant getting ...
38371137_health_immune_population_single[health, immune, population, single, threats, ...[#Climatechange has been identified as the big...
39381138_co2_atmosphere_were_was[co2, atmosphere, were, was, 2016, usa, high, ...[@realDonaldTrump @foxandfriends Earth's abili...
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"topic_model\",\n \"rows\": 40,\n \"fields\": [\n {\n \"column\": \"Topic\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11,\n \"min\": -1,\n \"max\": 38,\n \"num_unique_values\": 40,\n \"samples\": [\n 18,\n 15,\n 14\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 231,\n \"min\": 11,\n \"max\": 1330,\n \"num_unique_values\": 27,\n \"samples\": [\n 60,\n 40,\n 47\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 40,\n \"samples\": [\n \"18_ai_iot_robotics_automation\",\n \"15_cyclone_mozambique_cycloneidai_idai\",\n \"14_auspol_australia_australian_government\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Representation\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Representative_Docs\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":31}],"source":["# run BERTopic ... it takes about 1 min\n","topic_model = BERTopic(nr_topics=\"auto\")\n","docs = df['translated']\n","topics, probs = topic_model.fit_transform(docs)\n","topic_model.get_topic_info()"]},{"cell_type":"code","source":["# reduce outliers (they are very many)\n","new_topics = topic_model.reduce_outliers(docs, topics)\n","topic_model.update_topics(docs, topics=new_topics)\n","topic_model.get_topic_info()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"id":"Uy1UEVPpize0","executionInfo":{"status":"ok","timestamp":1765744852866,"user_tz":-60,"elapsed":401,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}},"outputId":"f6cd5b76-e4f3-40e1-9199-1ec6fed81e17"},"execution_count":32,"outputs":[{"output_type":"stream","name":"stderr","text":["2025-12-14 20:40:52,645 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.\n"]},{"output_type":"execute_result","data":{"text/plain":[" Topic Count Name \\\n","0 0 1343 0_climatechange_the_to_climate \n","1 1 479 1_sustainability_environment_plastic_and \n","2 2 273 2_sdgs_the_development_to \n","3 3 143 3_food_agriculture_farming_farmers \n","4 4 104 4_ice_arctic_sea_melting \n","5 5 92 5_africa_sdgs_in_for \n","6 6 96 6_solar_energy_implement_panelsnotpipelines \n","7 7 69 7_trees_forests_planting_forest \n","8 8 85 8_oil_carbon_emissions_gas \n","9 9 66 9_cdnpoli_canada_canadians_price \n","10 10 55 10_electric_cars_norway_tesla \n","11 11 63 11_water_environment_the_of \n","12 12 46 12_sdgs_women_gender_womensday \n","13 13 69 13_renewables_power_renewableenergy_wind \n","14 14 53 14_auspol_australia_australian_ausvotes2019 \n","15 15 35 15_cyclone_mozambique_cycloneidai_idai \n","16 16 26 16_insects_insect_numbers_pesticides \n","17 17 27 17_women_girls_gender_that \n","18 18 31 18_ai_iot_robotics_ronaldvanloon \n","19 19 32 19_wildlife_nature_species_animals \n","20 20 37 20_temperature_phd_kidsbeachgarden_masters \n","21 21 38 21_air_pollution_airpollution_smoking \n","22 22 20 22_mumbai_indian_india_28 \n","23 23 19 23_reef_coral_barrier_bleaching \n","24 24 24 24_eu_for_european_2050 \n","25 25 34 25_warming_canada_global_twice \n","26 26 35 26_green_deal_greennewdeal_new \n","27 27 26 27_methane_emissions_fuels_than \n","28 28 17 28_fish_lakes_fisheries_oceans \n","29 29 18 29_demswork4usa_progressives_healthcare_medicaid \n","30 30 20 30_attenborough_david_sir_documentary \n","31 31 28 31_cities_city_climate_change \n","32 32 21 32_garden_climate_back_secret \n","33 33 27 33_ocean_oceans_marine_amp \n","34 34 15 34_fracking_judge_drilling_court \n","35 35 14 35_kashthefuturist_jackthelad1947_charluv2011_... \n","36 36 18 36_nuclear_energy_policy_greenpeace \n","37 37 23 37_health_problem_is_immune \n","38 38 26 38_co2_were_atmosphere_million \n","\n"," Representation \\\n","0 [climatechange, the, to, climate, of, is, and,... \n","1 [sustainability, environment, plastic, and, fo... \n","2 [sdgs, the, development, to, amp, on, for, of,... \n","3 [food, agriculture, farming, farmers, soil, to... \n","4 [ice, arctic, sea, melting, glaciers, in, the,... \n","5 [africa, sdgs, in, for, nigeria, of, the, amp,... \n","6 [solar, energy, implement, panelsnotpipelines,... \n","7 [trees, forests, planting, forest, deforestati... \n","8 [oil, carbon, emissions, gas, and, its, climat... \n","9 [cdnpoli, canada, canadians, price, trudeau, t... \n","10 [electric, cars, norway, tesla, ev, energy, cr... \n","11 [water, environment, the, of, amp, is, and, wo... \n","12 [sdgs, women, gender, womensday, iwd2019, girl... \n","13 [renewables, power, renewableenergy, wind, ene... \n","14 [auspol, australia, australian, ausvotes2019, ... \n","15 [cyclone, mozambique, cycloneidai, idai, zimba... \n","16 [insects, insect, numbers, pesticides, crashin... \n","17 [women, girls, gender, that, to, international... \n","18 [ai, iot, robotics, ronaldvanloon, automation,... \n","19 [wildlife, nature, species, animals, environme... \n","20 [temperature, phd, kidsbeachgarden, masters, b... \n","21 [air, pollution, airpollution, smoking, health... \n","22 [mumbai, indian, india, 28, globalwarming, nee... \n","23 [reef, coral, barrier, bleaching, reefs, great... \n","24 [eu, for, european, 2050, vote, remain, climat... \n","25 [warming, canada, global, twice, report, rate,... \n","26 [green, deal, greennewdeal, new, capitalism, v... \n","27 [methane, emissions, fuels, than, fossil, burn... \n","28 [fish, lakes, fisheries, oceans, study, eutrop... \n","29 [demswork4usa, progressives, healthcare, medic... \n","30 [attenborough, david, sir, documentary, presen... \n","31 [cities, city, climate, change, of, how, it, w... \n","32 [garden, climate, back, secret, protect, vulne... \n","33 [ocean, oceans, marine, amp, whales, the, prot... \n","34 [fracking, judge, drilling, court, mining, gas... \n","35 [kashthefuturist, jackthelad1947, charluv2011,... \n","36 [nuclear, energy, policy, greenpeace, uranium,... \n","37 [health, problem, is, immune, population, sing... \n","38 [co2, were, atmosphere, million, levels, there... \n","\n"," Representative_Docs \n","0 [It's elections time. Time for a new party: on... \n","1 [Did you, like me, think tins, glass and paper... \n","2 [The Sustainable\\nDevelopment Goals (#SDGs):\\n... \n","3 [Innovation is the central driving force which... \n","4 [ARCTIC SEA-ICE EXPLOSION : Largest Increase I... \n","5 [\"Africa bears the brunt of #ClimateChange and... \n","6 [Chinese scientists have invented #solar panel... \n","7 [Pakistan is planting 10 billion trees. To fig... \n","8 [According to a new report, the five largest s... \n","9 [#BigOil knew their products would cause #cli... \n","10 [Norway is banning cars from the centre of Osl... \n","11 [Just started a course this week on Water and ... \n","12 [I am very happy to share my experience with s... \n","13 [Wow. #Denmark is using the power of the ocean... \n","14 [This week Scott Morrison doubled down on Tony... \n","15 [At Least 150 Dead, 1.5 Million Impacted as Cy... \n","16 [Insect numbers are crashing in huge numbers d... \n","17 [@NamugerwaLeah @GretaThunberg @BBCAfrica @Ext... \n","18 [“Imagine creating a world that is driven by e... \n","19 [#planet #sustainable #sustainability \\n#susta... \n","20 [Animation showing the evolution of global mea... \n","21 [Air pollution is now more deadly than war, sm... \n","22 [Can someone urgently verify this before I fai... \n","23 [The Great Barrier Reef is being battered by #... \n","24 [Yesss! European youth is rising for the clima... \n","25 [Climate change is warming Canada at a rate tw... \n","26 [PRESS CONFERENCE: Tomorrow, I'll be unveiling... \n","27 [“The science is crystal clear, we need to pha... \n","28 [RT @SOCCOMProject: Oceans’ fever means fewer ... \n","29 [🔁#Florida CD26 #Election2020-Nov-3.\\n\\nSuppor... \n","30 [Sir David Attenborough to present climate cha... \n","31 [Venice\\n\\nSo full of history, art, and poetry... \n","32 [Northwestern tribes and the University of Was... \n","33 [30×30: groundbreaking scientific study maps o... \n","34 [It has now become clear, with guidance from t... \n","35 [@Jackthelad1947 @NeurozoInnovat1 @ristori20 @... \n","36 [Given that a new nuclear power plant getting ... \n","37 [#Climatechange has been identified as the big... \n","38 [@realDonaldTrump @foxandfriends Earth's abili... "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
TopicCountNameRepresentationRepresentative_Docs
0013430_climatechange_the_to_climate[climatechange, the, to, climate, of, is, and,...[It's elections time. Time for a new party: on...
114791_sustainability_environment_plastic_and[sustainability, environment, plastic, and, fo...[Did you, like me, think tins, glass and paper...
222732_sdgs_the_development_to[sdgs, the, development, to, amp, on, for, of,...[The Sustainable\\nDevelopment Goals (#SDGs):\\n...
331433_food_agriculture_farming_farmers[food, agriculture, farming, farmers, soil, to...[Innovation is the central driving force which...
441044_ice_arctic_sea_melting[ice, arctic, sea, melting, glaciers, in, the,...[ARCTIC SEA-ICE EXPLOSION : Largest Increase I...
55925_africa_sdgs_in_for[africa, sdgs, in, for, nigeria, of, the, amp,...[\"Africa bears the brunt of #ClimateChange and...
66966_solar_energy_implement_panelsnotpipelines[solar, energy, implement, panelsnotpipelines,...[Chinese scientists have invented #solar panel...
77697_trees_forests_planting_forest[trees, forests, planting, forest, deforestati...[Pakistan is planting 10 billion trees. To fig...
88858_oil_carbon_emissions_gas[oil, carbon, emissions, gas, and, its, climat...[According to a new report, the five largest s...
99669_cdnpoli_canada_canadians_price[cdnpoli, canada, canadians, price, trudeau, t...[#BigOil knew their products would cause #cli...
10105510_electric_cars_norway_tesla[electric, cars, norway, tesla, ev, energy, cr...[Norway is banning cars from the centre of Osl...
11116311_water_environment_the_of[water, environment, the, of, amp, is, and, wo...[Just started a course this week on Water and ...
12124612_sdgs_women_gender_womensday[sdgs, women, gender, womensday, iwd2019, girl...[I am very happy to share my experience with s...
13136913_renewables_power_renewableenergy_wind[renewables, power, renewableenergy, wind, ene...[Wow. #Denmark is using the power of the ocean...
14145314_auspol_australia_australian_ausvotes2019[auspol, australia, australian, ausvotes2019, ...[This week Scott Morrison doubled down on Tony...
15153515_cyclone_mozambique_cycloneidai_idai[cyclone, mozambique, cycloneidai, idai, zimba...[At Least 150 Dead, 1.5 Million Impacted as Cy...
16162616_insects_insect_numbers_pesticides[insects, insect, numbers, pesticides, crashin...[Insect numbers are crashing in huge numbers d...
17172717_women_girls_gender_that[women, girls, gender, that, to, international...[@NamugerwaLeah @GretaThunberg @BBCAfrica @Ext...
18183118_ai_iot_robotics_ronaldvanloon[ai, iot, robotics, ronaldvanloon, automation,...[“Imagine creating a world that is driven by e...
19193219_wildlife_nature_species_animals[wildlife, nature, species, animals, environme...[#planet #sustainable #sustainability \\n#susta...
20203720_temperature_phd_kidsbeachgarden_masters[temperature, phd, kidsbeachgarden, masters, b...[Animation showing the evolution of global mea...
21213821_air_pollution_airpollution_smoking[air, pollution, airpollution, smoking, health...[Air pollution is now more deadly than war, sm...
22222022_mumbai_indian_india_28[mumbai, indian, india, 28, globalwarming, nee...[Can someone urgently verify this before I fai...
23231923_reef_coral_barrier_bleaching[reef, coral, barrier, bleaching, reefs, great...[The Great Barrier Reef is being battered by #...
24242424_eu_for_european_2050[eu, for, european, 2050, vote, remain, climat...[Yesss! European youth is rising for the clima...
25253425_warming_canada_global_twice[warming, canada, global, twice, report, rate,...[Climate change is warming Canada at a rate tw...
26263526_green_deal_greennewdeal_new[green, deal, greennewdeal, new, capitalism, v...[PRESS CONFERENCE: Tomorrow, I'll be unveiling...
27272627_methane_emissions_fuels_than[methane, emissions, fuels, than, fossil, burn...[“The science is crystal clear, we need to pha...
28281728_fish_lakes_fisheries_oceans[fish, lakes, fisheries, oceans, study, eutrop...[RT @SOCCOMProject: Oceans’ fever means fewer ...
29291829_demswork4usa_progressives_healthcare_medicaid[demswork4usa, progressives, healthcare, medic...[🔁#Florida CD26 #Election2020-Nov-3.\\n\\nSuppor...
30302030_attenborough_david_sir_documentary[attenborough, david, sir, documentary, presen...[Sir David Attenborough to present climate cha...
31312831_cities_city_climate_change[cities, city, climate, change, of, how, it, w...[Venice\\n\\nSo full of history, art, and poetry...
32322132_garden_climate_back_secret[garden, climate, back, secret, protect, vulne...[Northwestern tribes and the University of Was...
33332733_ocean_oceans_marine_amp[ocean, oceans, marine, amp, whales, the, prot...[30×30: groundbreaking scientific study maps o...
34341534_fracking_judge_drilling_court[fracking, judge, drilling, court, mining, gas...[It has now become clear, with guidance from t...
35351435_kashthefuturist_jackthelad1947_charluv2011_...[kashthefuturist, jackthelad1947, charluv2011,...[@Jackthelad1947 @NeurozoInnovat1 @ristori20 @...
36361836_nuclear_energy_policy_greenpeace[nuclear, energy, policy, greenpeace, uranium,...[Given that a new nuclear power plant getting ...
37372337_health_problem_is_immune[health, problem, is, immune, population, sing...[#Climatechange has been identified as the big...
38382638_co2_were_atmosphere_million[co2, were, atmosphere, million, levels, there...[@realDonaldTrump @foxandfriends Earth's abili...
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"topic_model\",\n \"rows\": 39,\n \"fields\": [\n {\n \"column\": \"Topic\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11,\n \"min\": 0,\n \"max\": 38,\n \"num_unique_values\": 39,\n \"samples\": [\n 33,\n 36,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 221,\n \"min\": 14,\n \"max\": 1343,\n \"num_unique_values\": 32,\n \"samples\": [\n 15,\n 26,\n 34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 39,\n \"samples\": [\n \"33_ocean_oceans_marine_amp\",\n \"36_nuclear_energy_policy_greenpeace\",\n \"4_ice_arctic_sea_melting\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Representation\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Representative_Docs\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":32}]},{"cell_type":"code","source":["topic_model.visualize_topics()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":667},"id":"0NJP2djOez5S","outputId":"1886fc71-e463-4da3-8bca-d0e7cf70cab3","executionInfo":{"status":"ok","timestamp":1765744852921,"user_tz":-60,"elapsed":54,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":33,"outputs":[{"output_type":"display_data","data":{"text/html":["\n","\n","\n","
\n","
\n","\n",""]},"metadata":{}}]},{"cell_type":"code","source":["topic_model.visualize_heatmap()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":837},"id":"JLqCLerTe5W1","outputId":"47610ed3-f0c8-4ff4-8746-032c43defc34","executionInfo":{"status":"ok","timestamp":1765744852985,"user_tz":-60,"elapsed":60,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":34,"outputs":[{"output_type":"display_data","data":{"text/html":["\n","\n","\n","
\n","
\n","\n",""]},"metadata":{}}]},{"cell_type":"code","source":["topic_model.visualize_documents(docs)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":787},"id":"Am6qJRHMfbLz","outputId":"81162800-2a3d-4a50-c88a-0f481dfb6bdb","executionInfo":{"status":"ok","timestamp":1765744875092,"user_tz":-60,"elapsed":22106,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":35,"outputs":[{"output_type":"display_data","data":{"text/html":["\n","\n","\n","
\n","
\n","\n",""]},"metadata":{}}]},{"cell_type":"code","source":["topic_model.visualize_barchart(top_n_topics=100, n_words=15)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"id":"1aPhNrsBfqKs","outputId":"5b784fc2-3426-492b-b847-4cca3a3bb618","executionInfo":{"status":"ok","timestamp":1765744875104,"user_tz":-60,"elapsed":3,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":36,"outputs":[{"output_type":"display_data","data":{"text/html":["\n","\n","\n","
\n","
\n","\n",""]},"metadata":{}}]},{"cell_type":"markdown","source":["# Get measures on topic assignment"],"metadata":{"id":"6AGeC7r3mg21"}},{"cell_type":"code","source":["# load stored data\n","with open(\"/content/semantic_net.pkl\", 'rb') as f:\n"," in_data = pickle.load(f)"],"metadata":{"id":"rXMa7cU8j1Ld","executionInfo":{"status":"ok","timestamp":1765744346669,"user_tz":-60,"elapsed":525,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":18,"outputs":[]},{"cell_type":"code","source":["# build C matrix\n","b_topics = np.array(new_topics)\n","b_topics = b_topics[in_data.documents]\n","C = sps.csr_matrix((len(b_topics),b_topics.max()+1))\n","for i in range(len(b_topics)):\n"," C[i,b_topics[i]] = 1\n","\n","# builds topic matrices\n","Pwc = in_data.Pwd.dot(C) # joint word + class probability\n","Pcc = ((C.T).dot(in_data.Pdd)).dot(C) # joint class + class probability\n","pc = Pcc.sum(axis=0)\n","\n","# show number of topics, and size\n","plt.bar(np.array(range(C.shape[1])),np.array(C.sum(axis=0))[0])\n","plt.xlabel(\"topic #\")\n","plt.ylabel(\"# of documents\");"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":540},"id":"hsaGdrRikKny","executionInfo":{"status":"ok","timestamp":1765745026841,"user_tz":-60,"elapsed":1484,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}},"outputId":"dc561891-4d40-4d5a-8ea5-17ef02e018fa"},"execution_count":40,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/scipy/sparse/_index.py:168: SparseEfficiencyWarning:\n","\n","Changing the sparsity structure of a csr_matrix is expensive. lil and dok are more efficient.\n","\n"]},{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["# extract measures\n","NMI = nmi_fn(Pwc)\n","Q = modularity_fn(Pcc)\n","Ncut = ncut_fn(Pcc)\n","rd = infomap_rank_fn(in_data.Pdd) # we need the PageRank vector first\n","Infomap = infomap_fn(C,in_data.Pdd,rd)\n","if (pc.shape[1]==1):\n"," com = 0\n","else:\n"," com = _infomap_fn(pc)/np.log(pc.shape[1])"],"metadata":{"id":"Wo4Ytt-DkOFT","executionInfo":{"status":"ok","timestamp":1765745033493,"user_tz":-60,"elapsed":3046,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}}},"execution_count":41,"outputs":[]},{"cell_type":"code","source":["# collect them in dataframe\n","pd.DataFrame(data = {'topics': C.shape[1], 'com': com,\n"," 'NMI': NMI, 'Q': Q, 'Ncut': Ncut, 'Infomap': Infomap}, index=[0])"],"metadata":{"id":"PY7lCxvqrJW1","colab":{"base_uri":"https://localhost:8080/","height":81},"executionInfo":{"status":"ok","timestamp":1765745034534,"user_tz":-60,"elapsed":16,"user":{"displayName":"Tomaso Erseghe","userId":"15955126948488574654"}},"outputId":"ee39d3b3-0263-4973-89ee-6a683e7c29fc"},"execution_count":42,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" topics com NMI Q Ncut Infomap\n","0 39 0.664808 0.29315 0.096722 0.857763 0.085053"],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
topicscomNMIQNcutInfomap
0390.6648080.293150.0967220.8577630.085053
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \" 'NMI': NMI, 'Q': Q, 'Ncut': Ncut, 'Infomap': Infomap}, index=[0])\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"topics\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 39,\n \"max\": 39,\n \"num_unique_values\": 1,\n \"samples\": [\n 39\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"com\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.6648078888653084,\n \"max\": 0.6648078888653084,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.6648078888653084\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NMI\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.2931497725659325,\n \"max\": 0.2931497725659325,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.2931497725659325\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Q\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.09672155125395793,\n \"max\": 0.09672155125395793,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.09672155125395793\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Ncut\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.857762661569703,\n \"max\": 0.857762661569703,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.857762661569703\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Infomap\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.08505316873891888,\n \"max\": 0.08505316873891888,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.08505316873891888\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":42}]}],"metadata":{"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"},"accelerator":"GPU"},"nbformat":4,"nbformat_minor":0}