# Solution Exercise of Session 1 about collocations (multi-word expressions):
from collections import Counter
import nltk
# Count Noun-Noun collocation bigrams over the MR500train corpus.
# Each corpus line starts with a 2-character label prefix that is stripped.
with open("MR500train", "r") as f:
    lines = f.readlines()

co = Counter()
for l in lines:
    # Drop the 2-char label prefix and surrounding whitespace.
    l = l[2:].strip()
    wds = nltk.word_tokenize(l)
    # Fix: lowercase "i" is not correctly tagged by the POS tagger,
    # so normalize it back to "I" before tagging.
    wds = ["I" if w == "i" else w for w in wds]
    pos = nltk.pos_tag(wds)
    assert len(wds) == len(pos)
    # Example with pattern Noun-Noun: count adjacent NN/NN token pairs.
    for i in range(len(wds) - 1):
        if pos[i][1] == 'NN' and pos[i + 1][1] == 'NN':
            co[wds[i] + "_" + wds[i + 1]] += 1
print(co.most_common(20))

# Solution Exercise of Session 1 about Byte-Pair Encoding:
from collections import Counter
import collections
import re
# Build the initial word-frequency table for BPE from the corpus file.
# Each line carries a 2-character label prefix that is discarded.
with open("MR500train", "r") as f:
    corpus_lines = f.readlines()

co = Counter()
for raw_line in corpus_lines:
    tokens = raw_line[2:].strip().split(" ")
    co.update(tokens)
print("unigram %d " % (len(co),))
def get_stats(vocab):
    """Count the frequency of every adjacent symbol pair in the vocabulary.

    `vocab` maps a space-separated symbol string (one entry per word) to
    that word's corpus frequency. Returns a defaultdict mapping each
    (left_symbol, right_symbol) tuple to its total frequency.
    """
    # defaultdict(int) returns 0 for pairs not seen yet.
    pair_counts = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        # Slide over consecutive symbol pairs within the word.
        for left, right in zip(symbols, symbols[1:]):
            pair_counts[left, right] += freq
    return pair_counts
def merge_vocab(pair, v_in):
    """Return a new vocabulary with every occurrence of `pair` merged.

    `pair` is a (left_symbol, right_symbol) tuple; wherever the two
    symbols appear adjacent (space-separated) in a vocabulary key, they
    are joined into one symbol. Frequencies are carried over unchanged.
    """
    # Match the pair only as whole, whitespace-delimited symbols:
    # (?<!\S) rejects a non-space char before the match (so only start of
    # string or a space), and (?!\S) rejects a non-space char after it.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    merged_symbol = ''.join(pair)
    return {pattern.sub(merged_symbol, word): freq for word, freq in v_in.items()}
# Initial BPE vocabulary: each word is split into single characters
# followed by an end-of-word marker, keyed by its corpus frequency.
vocab = {}
for w in co:
    vocab[' '.join(w) + ' </s>'] = co[w]

num_merges = 100
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        # No adjacent pairs left to merge (every word collapsed to one
        # symbol) — max() would raise ValueError on an empty dict.
        break
    # pairs.get returns the frequency of a pair, so max() selects the
    # most frequent pair as the next merge.
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print(best)
# cf: https://medium.com/@makcedward/how-subword-helps-on-your-nlp-model-83dd1b836f46