About
With a feature hashing function, the number of bucket becomes the number of features leading to a dimension reduction
Articles Related
Python
from collections import defaultdict
import hashlib
def hashFunction(numBuckets, rawFeats, printMapping=False):
"""Calculate a feature dictionary for an observation's features based on hashing.
Note:
Use printMapping=True for debug purposes and to better understand how the hashing works.
Args:
numBuckets (int): Number of buckets to use as features.
rawFeats (list of (int, str)): A list of features for an observation. Represented as
(featureID, value) tuples.
printMapping (bool, optional): If true, the mappings of featureString to index will be
printed.
Returns:
dict of int to float: The keys will be integers which represent the buckets that the
features have been hashed to. The value for a given key will contain the count of the
(featureID, value) tuples that have hashed to that key.
"""
mapping = {}
for ind, category in rawFeats:
featureString = category + str(ind)
mapping[featureString] = int(int(hashlib.md5(featureString).hexdigest(), 16) % numBuckets)
if(printMapping): print mapping
sparseFeatures = defaultdict(float)
for bucket in mapping.values():
sparseFeatures[bucket] += 1.0
return dict(sparseFeatures)
sampleOne = [(0, 'blue'), (1, 'Open')]
sampleTwo = [(0, 'red'), (1, 'Closed'), (2, 'Big')]
sampleThree = [(0, 'purple'), (1, 'Running'), (2, 'Small') ]
# Use four buckets
sampOneFourBuckets = hashFunction(4, sampleOne, True)
sampTwoFourBuckets = hashFunction(4, sampleTwo, True)
sampThreeFourBuckets = hashFunction(4, sampleThree, True)
# Use one hundred buckets
sampOneHundredBuckets = hashFunction(100, sampleOne, True)
sampTwoHundredBuckets = hashFunction(100, sampleTwo, True)
sampThreeHundredBuckets = hashFunction(100, sampleThree, True)
print '\t\t 4 Buckets \t\t\t 100 Buckets'
print 'SampleOne:\t {0}\t\t {1}'.format(sampOneFourBuckets, sampOneHundredBuckets)
print 'SampleTwo:\t {0}\t\t {1}'.format(sampTwoFourBuckets, sampTwoHundredBuckets)
print 'SampleThree:\t {0}\t {1}'.format(sampThreeFourBuckets, sampThreeHundredBuckets)