
Frequent Itemsets



In [481]: from collections import defaultdict
import numpy as np
import time

def remove_false_positive(frequent_itemsets, s, filename):
    '''One pass over the full data set to remove false positives.'''
    if frequent_itemsets:
        singleton = frozenset(frequent_itemsets[0])
        with open(filename, 'r') as f:
            count_table = defaultdict(int)  # count every candidate itemset over the full data set
            for line in f:
                # st = time.perf_counter()
                items = frozenset([int(i.strip()) for i in line.split()])
                items = items.intersection(singleton)
                for item in items:
                    count_table[item] += 1  # single items
                for itemset in frequent_itemsets[1:]:
                    if itemset.issubset(items):
                        count_table[itemset] += 1
                # print("{} s".format(time.perf_counter() - st))
        singleton = set()
        frequent_itemsets = []
        for itemset in count_table:
            if type(itemset) is not frozenset:  # a single item
                if count_table[itemset] >= s:
                    singleton.add(itemset)
            else:  # a larger itemset
                if count_table[itemset] >= s:
                    frequent_itemsets.append(itemset)
        frequent_itemsets.insert(0, frozenset(singleton))
    return frequent_itemsets

def apriori(baskets, s):
    frequent_itemsets = []
    invert_index = defaultdict(set)
    for i, basket in enumerate(baskets):
        for item in basket:
            invert_index[item].add(i)  # item -> ids of baskets containing it
    singleton = set()
    for item in invert_index:
        if len(invert_index[item]) >= s:
            singleton.add(item)  # singleton frequent items
    singleton = frozenset(singleton)  # convert to frozenset
    frequent_itemsets.append(singleton)  # the first entry holds all frequent single items
    nitems = 1  # number of items in an itemset
    itemsets = list(singleton)
    while itemsets:
        print(len(itemsets))
        nitemsets = len(frequent_itemsets)
        old_invert_index = invert_index
        invert_index = defaultdict(frozenset)
        record_table = defaultdict(bool)
        for i in range(len(itemsets) - 1):
            for j in range(i + 1, len(itemsets)):
                if type(itemsets[i]) is not frozenset:
                    k = frozenset([itemsets[i], itemsets[j]])
                else:
                    k = itemsets[i].union(itemsets[j])  # build a new candidate itemset
                if len(k) == nitems + 1 and not record_table[k]:
                    v = old_invert_index[itemsets[i]].intersection(old_invert_index[itemsets[j]])
                    invert_index[k] = frozenset(v)
                    if len(v) >= s:
                        frequent_itemsets.append(k)
                    record_table[k] = True
        nitems += 1
        itemsets = frequent_itemsets[nitemsets:]
    return frequent_itemsets

def num_frequent_itemset(itemsets):
    if itemsets:
        return len(itemsets[0]) + len(itemsets[1:])
    return 0
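
As a quick sanity check (my own addition, not part of the assignment code), apriori can be run on a tiny hand-made set of baskets; the first element of the result holds the frequent single items and any later elements are larger frequent itemsets:

# Sanity check on a toy data set (my own addition).
toy_baskets = [[1, 2, 3], [1, 2], [2, 3], [1, 3], [2]]
print(apriori(toy_baskets, 3))  # items 1, 2, 3 each appear in at least 3 baskets; no pair does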

In [417]: baskets = []
with open('mushroom.dat', 'r') as f:
    for line in f:
        baskets.append([int(i.strip()) for i in line.split()])
frequent_itemsets = apriori(baskets, 5000)
print(len(frequent_itemsets))

In [419]: def SimpleRandomized(p, m, s, filename):
    '''Simple randomized algorithm.
       p: the probability to sample
       m: the total count of data
       s: the support
       filename: the path of the data file to read'''
    b = np.arange(m)
    np.random.shuffle(b)  # reorder the baskets randomly
    b = b[:int(p * m)]    # keep roughly a fraction p of the basket ids
    baskets = []
    with open(filename, 'r') as f:
        ln = 0
        for line in f:
            if ln in b:
                baskets.append([int(i.strip()) for i in line.split()])
            ln += 1
    # mine the sample with a slightly lowered threshold, then remove false positives
    frequent_itemsets = apriori(baskets, 0.9 * p * s)
    return remove_false_positive(frequent_itemsets, s, filename)

In [429]: import time
def SON(p, m, s, filename):
    frequent_itemsets = [frozenset()]  # all frequent itemsets found so far
    with open(filename, 'r') as f:
        chunk = 0
        while True:
            baskets = []
            for _ in range(int(p * m) + 1):
                line = f.readline()
                if not line:
                    break  # all data has been read
                baskets.append([int(i.strip()) for i in line.split()])
            if baskets:
                # st = time.perf_counter()
                itemsets = apriori(baskets, p * s)  # frequent itemsets for this chunk
                singleton = frequent_itemsets[0].union(itemsets[0])
                frequent_itemsets = list(frozenset(frequent_itemsets[1:]).union(frozenset(itemsets[1:])))
                frequent_itemsets.insert(0, singleton)
                # print('chunk {}: {} s'.format(chunk, time.perf_counter() - st))
            if not line:
                break  # all data has been read
            chunk += 1
    return remove_false_positive(frequent_itemsets, s, filename)

In [430]: import pickle
###############################################
filename = 'T10I4D100K.dat'
p = 0.
m = 100000
s = 1000
np.random.seed(12345)
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)

with open('frequent_itemsets_SR_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets2, f)

In [438]: print('Dataset: {}'.format('T10I4D100K'))
print('Single Randomized: {}'.format(num_frequent_itemset(frequent_itemsets1)))
print('SON: {}'.format(num_frequent_itemset(frequent_itemsets2)))

In [455]: filename = 'T40I10D100K.dat'
p = 0.
m = 100000
s = 5000
np.random.seed(12345)
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)
with open('frequent_itemsets_SR_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets2, f)

35
Single Randomized: 383
SON: 385

In [461]: print('Dataset: {}'.format('T40I10D100K'))
print('Single Randomized: {}'.format(num_frequent_itemset(frequent_itemsets1)))
print('SON: {}'.format(num_frequent_itemset(frequent_itemsets2)))

In [479]: filename = 'chess.dat'
p = 0.
m = 3196
s = 3190
np.random.seed(12345)
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)

with open('frequent_itemsets_SR_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets2, f)

In [480]: print('Dataset: {}'.format('chess'))
print('Single Randomized: {}'.format(num_frequent_itemset(frequent_itemsets1)))
print('SON: {}'.format(num_frequent_itemset(frequent_itemsets2)))

In [ ]: filename = 'connect.dat'
p = 0.
m = 67557
s = 65000
np.random.seed(12345)
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)
with open('frequent_itemsets_SR_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets2, f)
In [ ]: filename = 'mushroom.dat'
p = 0.
m = 8124
s = 1000
np.random.seed(12345)
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)
with open('frequent_itemsets_SR_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets2, f)
In [ ]: filename = 'pumsb.dat'
p = 0.
m = 49046
s = 1000
np.random.seed(12345)
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)
with open('frequent_itemsets_SR_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets2, f)
In [ ]: filename = 'pumsb_star.dat'
p = 0.
m = 49046
s = 1000
np.random.seed(12345)
frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
frequent_itemsets2 = SON(p, m, s, filename)
with open('frequent_itemsets_SR_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets1, f)
with open('frequent_itemsets_SON_{}_{}.pkl'.format(filename, p), 'wb') as f:
    pickle.dump(frequent_itemsets2, f)
Dataset: T40I10D100K
Single Randomized: 316
SON: 316
Dataset: chess
Single Randomized: 1
SON: 1
In [ ]: ps = [0.01, 0.02, 0.05, 0.1]
filename = 'T10I4D100K.dat'
m = 100000
s = 1000
for p in ps:
    np.random.seed(12345)
    frequent_itemsets1 = SimpleRandomized(p, m, s, filename)
    frequent_itemsets2 = SON(p, m, s, filename)
    with open('frequent_itemsets_SR_{}_{}.pkl'.format(filename, p), 'wb') as f:
        pickle.dump(frequent_itemsets1, f)
    with open('frequent_itemsets_SON_{}_{}.pkl'.format(filename, p), 'wb') as f:
        pickle.dump(frequent_itemsets2, f)
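
The saved results for the different sampling fractions can then be compared; the snippet below is a sketch of my own (not part of the original notebook), reusing the pickle files written above and the num_frequent_itemset helper:

# Sketch (my own addition): reload the pickled results and compare itemset counts per p.
counts = {}
for p in [0.01, 0.02, 0.05, 0.1]:
    with open('frequent_itemsets_SR_{}_{}.pkl'.format('T10I4D100K.dat', p), 'rb') as f:
        sr = pickle.load(f)
    with open('frequent_itemsets_SON_{}_{}.pkl'.format('T10I4D100K.dat', p), 'rb') as f:
        son = pickle.load(f)
    counts[p] = (num_frequent_itemset(sr), num_frequent_itemset(son))
print(counts)  # {p: (SimpleRandomized count, SON count)}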

Report of the experiments

The main challenge I encountered is that on some data sets (e.g. chess.dat and connect.dat) the frequent itemsets can contain many items, so the candidate sets become very large and the algorithm takes a long time to run.

Clustering

P1: Perform a hierarchical clustering on the one-dimensional set of points 1,4,9,16,25,36,49,64,81.

In [50]: %matplotlib inline
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram

def pairwise(m, n):
    '''Map index m of a condensed distance matrix of n points back to the pair (i, j).'''
    c = 0
    for i in range(0, n - 1):
        for j in range(i + 1, n):
            if c == m:
                return i, j
            c += 1
    return None

data = [1, 4, 9, 16, 25, 36, 49, 64, 81]
clusters = [[d] for d in data]
n = len(clusters)
cindex = list(range(n))
Z = np.ones((n - 1, 4))
for i in range(n - 1):
    # get the centroid of each cluster
    centroids = np.array([np.mean(clusters[c]) for c in cindex])
    # compute the pairwise distances between centroids
    X = pdist(centroids[:, None])
    # find the closest pair of clusters and merge them
    m = np.argmin(X)
    c1, c2 = pairwise(m, len(centroids))
    c = clusters[cindex[c1]] + clusters[cindex[c2]]
    clusters.append(c)
    Z[i, 0] = cindex[c1]
    Z[i, 1] = cindex[c2]
    Z[i, 2] = X[m]
    Z[i, 3] = len(c)
    cindex[c1] = n + i
    del cindex[c2]
ax = dendrogram(Z, labels=data)
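
For reference, the merges produced above use centroid distances, so they can be cross-checked against SciPy's built-in centroid linkage; this is a small sketch of my own, not part of the assignment solution:

# Sketch (my own addition): cross-check with SciPy's centroid linkage.
from scipy.cluster.hierarchy import linkage
Z_ref = linkage(np.array(data, dtype=float)[:, None], method='centroid')
print(Z_ref)  # same layout as Z: [cluster 1, cluster 2, merge distance, merged size]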

P2: Implement the K-means Algorithm and carry out experiments on the provided Iris dataset

In [89]: import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
# load the data
data = load_iris()
# all 4 feature dimensions
X = data['data']
# fit one model per value of k
models_per_k = []
inertias = []
for k in range(1, 10):
    # use KMeans to cluster the data
    model = KMeans(n_clusters=k).fit(X)
    models_per_k.append(model)
    inertias.append(model.inertia_)
# plot the inertia for different k
fig, ax = plt.subplots()
ax.plot(range(1, 10), inertias, 'x-')
ax.set_xlabel('$k$')
ax.set_ylabel('inertia')
# highlight the elbow at k = 3 with a hollow red circle
ax.scatter(3, inertias[2], facecolors='none', marker='o', edgecolors='r', s=200)
plt.show()

Using the elbow method to select an optimal k: in the plot above the x-axis is the value of k and the y-axis is the inertia (the sum of within-cluster variances). The elbow method picks the turning point of the curve, which here is at k = 3.
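
As a side note (my own addition), the inertia reported by scikit-learn is the sum of squared distances from each sample to its assigned cluster centre, which can be recomputed by hand for, say, the k = 3 model:

# Sketch (my own addition): recompute inertia for the fitted k = 3 model by hand.
model3 = models_per_k[2]
manual = sum(np.sum((X[model3.labels_ == c] - model3.cluster_centers_[c]) ** 2)
             for c in range(3))
print(manual, model3.inertia_)  # the two values should agree up to floating point
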
In [70]: k = 3
# get the model for k = 3
model = models_per_k[k - 1]
# the cluster labels
y_pred = model.labels_
# the cluster centroids
centroids = model.cluster_centers_
# plot the first two dimensions
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
# plot the centroids
plt.plot(centroids[:, 0], centroids[:, 1], 'rx', ms=15, mew=3)
plt.show()

Advertising

Suppose that there are three advertisers A, B, and C. There are three queries x, y, and z. Each advertiser has a budget of 2. Advertiser A only bids on x, B bids on x and y, and C bids on x, y, and z. Note that on the query sequence xxyyzz, the optimal offline algorithm would yield a revenue of 6, since all queries can be assigned.

P1: Show that the greedy algorithm will assign at least 4 of the 6 queries xxyyzz

The two x queries are always assigned, since A, B, and C all bid on x and have a combined budget of 6. B and C both bid on y and together have a budget of 4; the first two x queries consume at most 2 of that budget, so at least 2 budget remains and both y queries are also assigned. Hence the greedy algorithm assigns at least 4 of the 6 queries (the z queries may be lost if C's budget has already been spent).
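
To make the bound concrete, here is a small sketch of my own (not part of the assignment) that simulates the greedy algorithm over every possible tie-breaking choice on the sequence xxyyzz; the bids table and the greedy helper are illustrative only:

from itertools import product

# Which advertisers bid on each query; every advertiser has a budget of 2.
bids = {'x': ['A', 'B', 'C'], 'y': ['B', 'C'], 'z': ['C']}

def greedy(sequence, choices):
    '''Run the greedy algorithm, using `choices` (an iterator of indices)
       to break ties among the eligible advertisers for each query.'''
    budget = {'A': 2, 'B': 2, 'C': 2}
    assigned = 0
    for q in sequence:
        eligible = [a for a in bids[q] if budget[a] > 0]
        if eligible:
            a = eligible[next(choices) % len(eligible)]
            budget[a] -= 1
            assigned += 1
    return assigned

results = [greedy('xxyyzz', iter(c)) for c in product(range(3), repeat=6)]
print(min(results), max(results))  # every tie-break assigns at least 4; the best assigns all 6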

P2: Find another sequence of queries such that the greedy algorithm can assign as few as half the queries that the optimal offline algorithm would assign to that sequence

Consider the sequence xyzz. The optimal offline algorithm assigns all four queries (x to A, y to B, and both z queries to C). However, if the greedy algorithm happens to assign both x and y to C, then C's budget is exhausted and neither z query can be assigned, so the greedy algorithm assigns only two queries, half of what the offline optimum achieves.
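
Reusing the greedy sketch from P1 on the sequence xyzz confirms this worst case (again my own illustration):

# Worst case over all tie-breaks on xyzz (the optimal offline algorithm assigns 4).
results = [greedy('xyzz', iter(c)) for c in product(range(3), repeat=4)]
print(min(results))  # 2, i.e. half of the optimal 4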