import numpy as np
import pandas as pd
data = pd.read_csv('filter_sizes2.csv', index_col=0)
data.head()
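# The CSV is expected to provide, per block, an 'output_n' column (number of filter
# elements) and a 'size' column (block size, presumably in bytes since it is
# multiplied by 8 below); these are used later to build the N and S arrays.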
import math
import scipy.optimize
def expected_data_size(c, N, B, S):
    """
    Expected number of bits a client downloads for a block it has no true match in:
    the filter is always downloaded, and the full block only on a false-positive match.

    c is the number of filter elements the client is watching
    N is the number of filter elements in a block
    B is the Golomb-Rice coding parameter (number of remainder bits per element)
    S is the size of the full block

    Returns the filter size (N times the expected encoded bits per element) plus
    8 * S times the false-positive match probability.
    """
    P = 2.0 ** B
    M = 1.497137 * P
    # Expected Golomb-Rice encoded size per element: B remainder bits plus the
    # expected number of bits in the unary-coded quotient part.
    bits = B + 1.0 / (1 - (1 - 1.0 / M) ** P)
    # Probability that at least one of the c watched elements falsely matches
    # one of the block's N filter elements.
    fp = (1 - (1 - 1.0 / (N * P))**(c*N))
    return N * bits + 8 * S * fp
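# Illustrative call (hypothetical numbers, not taken from the data set): expected
# download in bits for a client watching 100 elements, a block with 1,000 filter
# elements, parameter B = 20, and a 1 MB block.
expected_data_size(100, 1000, 20, 1_000_000)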
def generate_func(N, S):
    """
    Generate a vectorized (numpy) function for computing expected data sizes.

    Takes numpy arrays of per-block element counts N and block sizes S, and returns
    a function of the scalar client element count c. The returned function evaluates
    every block and every Golomb-Rice parameter B = 1..48 at once, producing an
    array of shape (48, len(N)).
    """
    B = np.arange(1, 49)
    P = 2.0 ** B
    M = 1.497137 * P
    # Expected Golomb-Rice encoded size per element for each candidate B.
    bits = B + 1.0 / (1 - (1 - 1.0 / M) ** P)
    # Y[i, j]: probability that a single watched element does not collide with any
    # of block j's N[j] filter elements under parameter B[i]; 1 - Y ** c is then the
    # false-positive probability for a client watching c elements.
    # (np.maximum guards against division by zero for blocks with no filter elements.)
    Y = np.power(1 - 1.0 / np.outer(M, np.maximum(1, N)), N)
    return lambda c: np.outer(bits, N) + 8 * S * (1 - Y ** c)
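# Minimal sketch of generate_func usage with synthetic numbers (not the real data):
# two hypothetical blocks with 500 and 1,500 filter elements, 250 kB and 1 MB in size.
_f = generate_func(np.array([500, 1500]), np.array([250_000, 1_000_000]))
_f(100).shape  # (48, 2): one row per candidate B = 1..48, one column per block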
N = data['output_n'].values
S = data['size'].values
func = generate_func(N, S)
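# Total expected download (bits) summed over all blocks, for clients watching
# c = 10, 100, 1,000, and 10,000 elements. Each result is a length-48 vector,
# one entry per candidate Golomb-Rice parameter B = 1..48.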
y10 = np.sum(func(10), axis=1)
y100 = np.sum(func(100), axis=1)
y1000 = np.sum(func(1000), axis=1)
y10000 = np.sum(func(10000), axis=1)
P = pd.RangeIndex(1, 49)
y10 = pd.Series(y10, index=P)
y100 = pd.Series(y100, index=P)
y1000 = pd.Series(y1000, index=P)
y10000 = pd.Series(y10000, index=P)
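# Golomb-Rice parameter minimizing the total expected download for each client size.
# Note that M in the labels below is the number of watched elements (c above), not
# the Golomb-Rice modulus used inside the filter math.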
{
'M=10': y10.idxmin(),
'M=100': y100.idxmin(),
'M=1,000': y1000.idxmin(),
'M=10,000': y10000.idxmin(),
}
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(10,5))
plt.plot(y10, label='M=10')
plt.plot(y100, label='M=100')
plt.plot(y1000, label='M=1,000')
plt.plot(y10000, label='M=10,000')
plt.title('Expected data size (bits) vs P')
plt.legend()