# 代写report | 作业Algorithm – Frequent Itemsets

### Frequent Itemsets

``````In [481]: from collections import defaultdict
import numpy as np
import time
``````
``````def remove_false_positive(frequent_itemsets, s, filename):
r '''one pass all data to remove false positive'''
if frequent_itemsets:
sington = frozenset(frequent_itemsets[ 0 ])
with open(filename, 'r') as f:
count_table = defaultdict(int) # count all frequent itemset in full data set
for line in f:
# st = time.perf_counter()
items = frozenset([int(i.strip()) for i in line.split()])
items = items.intersection(sington)
for item in items:
count_table[item] += 1 # single item
for itemset in frequent_itemsets[ 1 :]:
if itemset.issubset(items):
count_table[itemset] += 1
# print("{} s".format(time.perf_counter() - st))
singleton = set()
frequent_itemsets = []
for itemset in count_table:
if type(itemset) is not frozenset:
if count_table[itemset] >= s:
else :
if count_table[itemset] >= s:
frequent_itemsets.append(itemset)
frequent_itemsets.insert( 0 , frozenset(singleton))
return frequent_itemsets
``````
``````def apriori(baskets, s):
frequent_itemsets = []
``````
``````invert_index = defaultdict(set)
``````
``````sington = set()
for item in invert_index:
if len(invert_index[item]) >= s:
sington = frozenset(sington) # convert to frozenset
frequent_itemsets.append(sington) # add sington to all itemsets
``````
``````nitems = 1 # number of items in a itemset
itemsets = list(sington)
while itemsets:
print(len(itemsets))
nitemsets = len(frequent_itemsets)
old_invert_index = invert_index
invert_index = defaultdict(frozenset)
record_table = defaultdict(bool)
for i in range(len(itemsets) - 1 ):
for j in range(i + 1 , len(itemsets)):
if type(itemsets[i]) is not frozenset:
k = frozenset([itemsets[i], itemsets[j]])
else :
k = itemsets[i].union(itemsets[j]) # build new itemset
if len(k) == nitems + 1 and not record_table[k]:
v = old_invert_index[itemsets[i]].intersection(old_invert_index[itemsets[j]])
invert_index[k] = frozenset(v)
if len(v) >= s:
frequent_itemsets.append(k)
record_table[k] = True
nitems += 1
itemsets = frequent_itemsets[nitemsets:]
return frequent_itemsets
``````
``````def num_frequent_itemset(itemsets):
if itemsets:
return len(itemsets[ 0 ]) + len(itemsets[ 1 :])
return 0
``````

In [417]: baskets = [] with open(‘mushroom.dat’, ‘r’) as f: for line in f: baskets.append([int(i.strip()) for i in line.split()]) frequent_itemsets = apriori(baskets, 5000 ) print(len(frequent_itemsets))

In [419]: def SimpleRandomized(p, m, s, filename): r ”’ simple randomized algorithm: p: the probability to sample m: the total count of data s: the support filname: the path of data file to read ”’ b = np.arange(m) np.random.shuffle(b) # reorder the baskets randomly b = b[:int(pm)] baskets = [] with open(filename, ‘r’) as f: ln = 0 for line in f: if ln in b: baskets.append([int(i.strip()) for i in line.split()]) ln += 1 frequent_itemsets = apriori(baskets, 0.9p*s) return remove_false_positive(frequent_itemsets, s, filename)

In [429]: import time def SON(p, m, s, filename): frequent_itemsets = [frozenset()] # all frequent items with open(filename, ‘r’) as f: chunk = 0 while True : baskets = [] for _ in range(int(pm)+ 1 ): line = f.readline() if not line: break # all data has been read baskets.append([int(i.strip()) for i in line.split()]) if baskets: # st = time.perf_counter() itemsets = apriori(baskets, ps) # get frequent itemsets for this chunk singleton = frequent_itemsets[ 0 ].union(itemsets[ 0 ]) frequent_itemsets = list(frozenset(frequent_itemsets[ 1 :]).union(frozenset(itemsets[ 1 :]))) frequent_itemsets.insert( 0 , singleton) # print(‘chunk {}: {} s’.format(chunk, time.perf_counter() – st)) if not line: break # all data has been read chunk += 1 return remove_false_positive(frequent_itemsets, s, filename)

In [430]: import pickle ############################################### filename = ‘T10I4D100K.dat’ p = 0. m = 100000 s = 1000 np.random.seed( 12345 ) frequent_itemsets1 = SimpleRandomized(p, m, s, filename) frequent_itemsets2 = SON(p, m, s, filename)

``````with open('frequent_itemsets_SR_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets2, f)
``````

In [438]: print(‘ Dataset: {} ‘.format(‘T10I4D100K’)) print(‘Single Randomized: {} ‘.format(num_frequent_itemset(frequent_itemsets1))) print(‘ SON: {} ‘.format(num_frequent_itemset(frequent_itemsets2)))

In [455]: filename = ‘T40I10D100K.dat’ p = 0. m = 100000 s = 5000 np.random.seed( 12345 ) frequent_itemsets1 = SimpleRandomized(p, m, s, filename) frequent_itemsets2 = SON(p, m, s, filename) with open(‘frequent_itemsets_SR_ {} _ {} .pkl’.format(filename, p), ‘wb’) as f: pickle.dump(frequent_itemsets1, f) with open(‘frequent_itemsets_SON_ {} _ {} .pkl’.format(filename, p), ‘wb’) as f: pickle.dump(frequent_itemsets2, f)

###### 35
``````Single Randomized: 383
SON: 385
``````

In [461]: print(‘ Dataset: {} ‘.format(‘T40I10D100K’)) print(‘Single Randomized: {} ‘.format(num_frequent_itemset(frequent_itemsets1))) print(‘ SON: {} ‘.format(num_frequent_itemset(frequent_itemsets2)))

In [479]: filename = ‘chess.dat’ p = 0. m = 3196 s = 3190 np.random.seed( 12345 ) frequent_itemsets1 = SimpleRandomized(p, m, s, filename) frequent_itemsets2 = SON(p, m, s, filename)

``````with open('frequent_itemsets_SR_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets2, f)
``````

In [480]: print(‘ Dataset: {} ‘.format(‘chess’)) print(‘Single Randomized: {} ‘.format(num_frequent_itemset(frequent_itemsets1))) print(‘ SON: {} ‘.format(num_frequent_itemset(frequent_itemsets2)))

``````In [ ]: filename = 'connect.dat'
p = 0.
m = 67557
s = 65000
np.random.seed( 12345 )
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)
``````
``````with open('frequent_itemsets_SR_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets2, f)
``````
``````In [ ]: filename = 'mushroom.dat'
p = 0.
m = 8124
s = 1000
np.random.seed( 12345 )
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)
``````
``````with open('frequent_itemsets_SR_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets2, f)
``````
``````In [ ]: filename = 'pumsb.dat'
p = 0.
m = 49046
s = 1000
np.random.seed( 12345 )
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)
``````
``````with open('frequent_itemsets_SR_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets2, f)
``````
``````In [ ]: filename = 'pumsb_star.dat'
p = 0.
m = 49046
s = 1000
np.random.seed( 12345 )
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)
``````
``````with open('frequent_itemsets_SR_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets2, f)
``````
``````Dataset: T40I10D100K
Single Randomized: 316
SON: 316
``````
``````Dataset: chess
Single Randomized: 1
SON: 1
``````
``````In [ ]: ps = [0.01, 0.02, 0.05, 0.1]
filename = 'T10I4D100K.dat'
m = 100000
s = 1000
for p in ps:
np.random.seed( 12345 )
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)
``````
``````with open('frequent_itemsets_SR_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_ {} _ {} .pkl'.format(filename, p), 'wb') as f:
pickle.dump(frequent_itemsets2, f)
``````

#### report of the experiments

The most challenges I encountered is when running on some dataset i.e. chess.dat, connect.dat, the frequent itemset can contains many items, and the

candidate sets can be very large and it will take long time to run.

### Clustering

#### P1: Perform a hierarchical clustering on the one-dimensional set of points 1,4,9,16,25,36,49,64,81.

``````In [50]: % matplotlib inline
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram
``````
``````def pairwise(m, n):
c = 0
for i in range( 0 , n- 1 ):
for j in range(i+ 1 , n):
if c == m:
return i, j
c += 1
return None
``````
``````data = [ 1 , 4 , 9 , 16 , 25 , 36 , 49 , 64 , 81 ]
clusters = [[d] for d in data]
n = len(clusters)
cindex = list(range(n))
Z = np.ones((n- 1 , 4 ))
for i in range(n- 1 ):
# get the centroid of each cluster
centroids = np.array([np.mean(clusters[c]) for c in cindex])
# compute the pairwise distance
X = pdist(centroids[:, None ])
# the closest
m = np.argmin(X)
c1, c2 = pairwise(m, len(centroids))
c = clusters[cindex[c1]] + clusters[cindex[c2]]
clusters.append(c)
Z[i, 0 ] = cindex[c1]
Z[i, 1 ] = cindex[c2]
Z[i, 2 ] = X[m]
Z[i, 3 ] = len(c)
cindex[c1] = n+i
del cindex[c2]
ax = dendrogram(Z, labels=data)
``````

#### P2: Implement the K-means Algorithm and carry out experiments on the provided Iris dataset

``````In [89]: import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# first 4 dimension
X = data['data']
``````
``````# multiple models for different k-value
models_per_k = []
inertias = []
for k in range( 1 , 10 ):
# use KMeans to cluster the data
model = KMeans(n_clusters=k).fit(X)
models_per_k.append(model)
inertias.append(model.inertia_)
# plot the inertia with different k
fig, ax = plt.subplots()
ax.plot(range( 1 , 10 ), inertias, 'x-')
ax.set_xlabel('\$k\$')
ax.set_ylabel('inertia')
ax.scatter( 3 , inertias[ 2 ], color='', marker='o', edgecolors='r', s= 200 )
plt.show()
``````

Using the elbow method to select a optimal , from the above plot, the x-axis is the value of and the y-aixs is the sum of within-cluster variance, the elbow

method select the turning point of the plot which is at.

##### k = 3
``````In [70]: k = 3
# get the model for k=
model = models_per_k[k- 1 ]
# the cluster label
y_pred = model.labels_
# the cluster centoids
centroids = model.cluster_centers_
# plot the first two dimension
plt.scatter(X[:, 0 ], X[:, 1 ], c=y_pred)
# plot the centroids
plt.plot(centroids[:, 0 ], centroids[:, 1 ], 'rx', ms= 15 , mew= 3 )
plt.show()
``````

``````Suppose that there are three advertisers A,B, and C. There are three queries x, y, and z. Each advertiser has a budget of 2. Advertiser A only bids
on x, B bids on x and y, and C bids on x, y, and z. Note that on the query sequence xxyyzz, the optimal offline algorithm would yield a revenue of 6,
since all queries can be assigned.
``````

#### P1: Show that the greedy algorithm will assign at least 4 of the 6 queries xxyyzz

Since B and C both bids on y, and each has a budget of 2, so there are four budgets that can be assigned to y. The first two x queries can consume at most two

budgets, so there at least are two budgets for y after the first two x queries. So the two y queries will both be assigned. and the greedy algorithm can assign at

least 4 of the 6 queries.

#### optimal offline algorithm would assign to that sequence

Sequence xyzz, the offline algorithm would assign all four queries, but a greedy algorithm can only assign two queries, if the greedy algorithm assign C to xy.