# Load libraries and define helper classes

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import re
!pip install ftfy --quiet
import ftfy
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, RobertaTokenizer, RobertaModel, AutoModelForSequenceClassification
!pip install liwc --quiet
import liwc
import spacy
from spacy import displacy

# pick the compute device: the CUDA GPU when one is visible, otherwise the CPU
if not torch.cuda.is_available():
    # CPU fallback
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # GPU path: report how many devices exist and the name of device 0
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU is:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [2]:
class my_liwc():
    """Score text against the categories of a LIWC dictionary file.

    Attributes set by ``__init__``:
      parse          -- token parser returned by ``liwc.load_token_parser``
      category_names -- category names returned alongside the parser
      nlp            -- spaCy pipeline ("en_core_web_sm") used to split text into tokens
    """

    def __init__(self, liwc_file):
        """Load the LIWC dictionary at ``liwc_file`` and the spaCy pipeline."""
        parse, category_names = liwc.load_token_parser(liwc_file)
        self.parse = parse
        self.category_names = category_names
        self.nlp = spacy.load("en_core_web_sm")

    # check category correspondence
    def is_category(self, category_list):
        """Print a message for each name in ``category_list`` missing from the dictionary.

        Nothing is returned and nothing is raised; unknown names are only reported.
        """
        for c in category_list:
            if c not in self.category_names:
                print("Sorry, " + c + " is not in the current LIWC list!")

    # print categories
    def categories(self):
        """Print every category name of the loaded dictionary, one per line."""
        for c in self.category_names:
            print(c)

    # get LIWC value for a sentence and for a list of categories
    def get_categories(self, categories, text):
        """Return, per requested category, the fraction of tokens of ``text`` that match it.

        Parameters:
          categories -- sequence of category names to score
          text       -- string handed to the spaCy pipeline for tokenization

        Returns:
          A float numpy array with one entry per element of ``categories`` (same
          order). Values are proportions of tokens, NOT percentages (they are not
          multiplied by 100). A text that yields no tokens gives all zeros
          instead of dividing by zero.
        """
        counts = np.zeros(len(categories), dtype=int)
        doc = self.nlp(text)
        n_tokens = len(doc)
        if n_tokens == 0:
            # no tokens: avoid 0/0 (which would produce NaNs and a RuntimeWarning)
            return np.zeros(len(categories), dtype=float)
        for token in doc:
            # Parse each token once and materialize the result: the parser may
            # return a one-shot iterator, so it must not be re-queried per category.
            matched = set(self.parse(token.text.lower()))
            for i, category in enumerate(categories):
                if category in matched:
                    counts[i] += 1
        return counts / n_tokens

In [3]:
class my_bertagent():
    """Wrapper that scores sentences with the BERTAgent sequence-classification model.

    The tokenizer and model are downloaded from the Hugging Face Hub at a pinned
    revision, and the model is moved to the notebook-level ``device``.
    """

    def __init__(self):
        """Load the pinned tokenizer and single-output classification model."""
        model_name = "EnchantedStardust/bertagent-best"
        revision = "5bae55efbd95dd51759d275410cea36c81109227"
        # tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                       do_lower_case = True,
                                                       revision = revision)
        # roberta model
        self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                num_labels = 1,
                revision = revision
            ).to(device)

    def extract_ig(self, text, batch_size=20):
        """Return the model logits for every sentence in ``text``.

        Despite its name, this method only runs a forward pass and collects the
        logits; no attribution is computed here.

        Parameters:
          text       -- iterable of strings (e.g. a list or a pandas Series); a
                        single string is treated as one sentence
          batch_size -- number of sentences per forward pass

        Returns:
          A numpy array with the concatenated per-batch logits, one row per
          sentence (presumably shape (n_sentences, 1) given ``num_labels = 1``
          -- TODO confirm against the checkpoint).
        """
        # a bare string would otherwise be iterated character by character
        if isinstance(text, str):
            text = [text]

        # clean sentences
        text = self._clean_sentences(text)
        if not text:
            # nothing to score; np.concatenate([]) would raise further down
            return np.empty((0, 1), dtype=np.float32)

        # tokenize documents (padded / truncated to 128 tokens)
        tokenized_text = self.tokenizer(
            text,
            add_special_tokens = True,
            padding = "max_length",
            truncation = True,
            max_length = 128,
            return_attention_mask = True,
            return_tensors = 'pt',
            return_token_type_ids = False,
            return_offsets_mapping = False,
        )

        # create the dataloader (sequential order, so rows line up with the input)
        data = TensorDataset(tokenized_text['input_ids'],
                             tokenized_text['attention_mask'])
        dataloader = DataLoader(data,
                                batch_size = batch_size)

        self.model = self.model.eval()
        out = []
        # inference only: disable autograd so no graph is built per batch
        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Evaluating"):
                # load data from dataloader
                b_input_ids = batch[0].to(device)
                b_input_mask = batch[1].to(device)
                # look up the word embeddings and feed them in place of token ids
                b_embedd = self.model.roberta.embeddings.word_embeddings(
                                                       b_input_ids)
                # predictions (call the module, not .forward, so hooks still run)
                agency = self.model(inputs_embeds = b_embedd,
                                    attention_mask = b_input_mask)

                out.append(agency["logits"].cpu().numpy())

        return np.concatenate(out)

    def _clean_sentences(self, sentences):
        """Collapse whitespace runs, strip the ends, and repair text with ftfy.

        Returns a new list of cleaned strings in the same order as ``sentences``.
        """
        sentences = [re.sub(r"\s\s+", " ", sent).strip() for sent in sentences]
        sentences = [ftfy.fix_text(sent) for sent in sentences] # fix text
        return sentences

# Load data

In [4]:
# read the sentence-level dataset from the Excel workbook and display it
df = pd.read_excel(io="Lab2_data.xlsx")
df

Unnamed: 0,id_text,text
0,0,"while parking at a doctors office, i pulled in..."
1,0,Once i had parked i reied to say sorry as i co...
2,0,It was after all a genuine mistake on my park ...
3,0,but this person was in such a foul mood i coul...
4,1,My friend told me I am an idiot.
...,...,...
2836,503,none of us fell and i started walking again th...
2837,503,"but the lady started screaming, addressing me ..."
2838,504,I am talking to a saleswoman in a store and sh...
2839,504,I feel hurt and helpless.


# Get agency through BERTAgent

In [5]:
# score every sentence with BERTAgent (this is very slow without a GPU)
agency_scores = my_bertagent().extract_ig(df['text'])
df['agency'] = agency_scores
# display the frame with the new column
df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Evaluating:   0%|          | 0/143 [00:00<?, ?it/s]

Unnamed: 0,id_text,text,agency
0,0,"while parking at a doctors office, i pulled in...",-0.191778
1,0,Once i had parked i reied to say sorry as i co...,-0.055492
2,0,It was after all a genuine mistake on my park ...,-0.190038
3,0,but this person was in such a foul mood i coul...,-0.335492
4,1,My friend told me I am an idiot.,-0.190367
...,...,...,...
2836,503,none of us fell and i started walking again th...,0.265351
2837,503,"but the lady started screaming, addressing me ...",-0.095687
2838,504,I am talking to a saleswoman in a store and sh...,-0.093656
2839,504,I feel hurt and helpless.,-0.568953


# Get additional markers via (simulated) LIWC

In [6]:
# chosen markers
mark_list = ['anx','posemo','negemo','swear','anger']
# check they exist (unknown names are only reported, not rejected)
lw0 = my_liwc("LIWC2007_English100131.dic")
lw0.is_category(mark_list)
# run LIWC once per sentence and collect the per-marker scores
liwc_rows = [lw0.get_categories(mark_list, sentence)
             for sentence in tqdm(df['text'])]
# one row per sentence, one column per marker; the explicit reshape keeps an
# empty frame valid as well
liwc_scores = np.asarray(liwc_rows, dtype=float).reshape(len(df), len(mark_list))
# write each marker as a whole column: positional, so it does not depend on the
# frame having a default 0..n-1 index, and it avoids slow per-cell .loc writes
for j, marker in enumerate(mark_list):
  df[marker] = liwc_scores[:, j]
# show result
df

  0%|          | 0/2841 [00:00<?, ?it/s]

Unnamed: 0,id_text,text,agency,anx,posemo,negemo,swear,anger
0,0,"while parking at a doctors office, i pulled in...",-0.191778,0.0,0.041667,0.000000,0.0,0.000000
1,0,Once i had parked i reied to say sorry as i co...,-0.055492,0.0,0.024390,0.048780,0.0,0.024390
2,0,It was after all a genuine mistake on my park ...,-0.190038,0.0,0.000000,0.133333,0.0,0.000000
3,0,but this person was in such a foul mood i coul...,-0.335492,0.0,0.062500,0.000000,0.0,0.000000
4,1,My friend told me I am an idiot.,-0.190367,0.0,0.111111,0.111111,0.0,0.111111
...,...,...,...,...,...,...,...,...
2836,503,none of us fell and i started walking again th...,0.265351,0.0,0.000000,0.000000,0.0,0.000000
2837,503,"but the lady started screaming, addressing me ...",-0.095687,0.0,0.000000,0.100000,0.0,0.050000
2838,504,I am talking to a saleswoman in a store and sh...,-0.093656,0.0,0.000000,0.066667,0.0,0.066667
2839,504,I feel hurt and helpless.,-0.568953,0.0,0.000000,0.333333,0.0,0.000000


In [7]:
# print the name of every category in the loaded LIWC dictionary, one per line
lw0.categories()

funct
pronoun
ppron
i
we
you
shehe
they
ipron
article
verb
auxverb
past
present
future
adverb
preps
conj
negate
quant
number
swear
social
family
friend
humans
affect
posemo
negemo
anx
anger
sad
cogmech
insight
cause
discrep
tentat
certain
inhib
incl
excl
percept
see
hear
feel
bio
body
health
sexual
ingest
relativ
motion
space
time
work
achieve
leisure
home
money
relig
death
assent
nonfl
filler
