# Solution Exercise of Session 1 about collocations (multi-word expressions):
from collections import Counter
import nltk
# Count Noun-Noun collocation bigrams over the MR500train corpus.
# Each corpus line starts with a 2-character label prefix that is stripped.
with open("MR500train", "r") as f:
    lines = f.readlines()

co = Counter()
for l in lines:
    # Drop the 2-char label prefix and surrounding whitespace.
    l = l[2:].strip()
    wds = nltk.word_tokenize(l)
    # Fix: lowercase "i" is not correctly tagged by the POS tagger,
    # so normalize it back to "I" before tagging.
    wds = ["I" if w == "i" else w for w in wds]
    pos = nltk.pos_tag(wds)
    assert len(wds) == len(pos)
    # Example with pattern Noun-Noun: count adjacent NN/NN token pairs.
    for i in range(len(wds) - 1):
        if pos[i][1] == 'NN' and pos[i + 1][1] == 'NN':
            co[wds[i] + "_" + wds[i + 1]] += 1
print(co.most_common(20))

# Solution Exercise of Session 1 about Byte-Pair Encoding:
from collections import Counter
import collections
import re
# Build the initial word-frequency table for BPE from the corpus file.
# Each line carries a 2-character label prefix that is discarded.
with open("MR500train", "r") as f:
    corpus_lines = f.readlines()

co = Counter()
for raw_line in corpus_lines:
    tokens = raw_line[2:].strip().split(" ")
    co.update(tokens)
print("unigram %d " % (len(co),))
def get_stats(vocab):
    """Count the frequency of every adjacent symbol pair in the vocabulary.

    `vocab` maps a space-separated symbol string (one entry per word) to
    that word's corpus frequency. Returns a defaultdict mapping each
    (left_symbol, right_symbol) tuple to its total frequency.
    """
    # defaultdict(int) returns 0 for pairs not seen yet.
    pair_counts = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        # Slide over consecutive symbol pairs within the word.
        for left, right in zip(symbols, symbols[1:]):
            pair_counts[left, right] += freq
    return pair_counts
def merge_vocab(pair, v_in):
    """Return a new vocabulary with every occurrence of `pair` merged.

    `pair` is a (left_symbol, right_symbol) tuple; wherever the two
    symbols appear adjacent (space-separated) in a vocabulary key, they
    are joined into one symbol. Frequencies are carried over unchanged.
    """
    # Match the pair only as whole, whitespace-delimited symbols:
    # (?<!\S) rejects a non-space char before the match (so only start of
    # string or a space), and (?!\S) rejects a non-space char after it.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    merged_symbol = ''.join(pair)
    return {pattern.sub(merged_symbol, word): freq for word, freq in v_in.items()}
# Initial BPE vocabulary: each word is split into single characters
# followed by an end-of-word marker, keyed by its corpus frequency.
vocab = {}
for w in co:
    vocab[' '.join(w) + ' </s>'] = co[w]

num_merges = 100
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        # No adjacent pairs left to merge (every word collapsed to one
        # symbol) — max() would raise ValueError on an empty dict.
        break
    # pairs.get returns the frequency of a pair, so max() selects the
    # most frequent pair as the next merge.
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print(best)
# cf: https://medium.com/@makcedward/how-subword-helps-on-your-nlp-model-83dd1b836f46