This quick tutorial will show how to use OBSearch.
First, decide which kind of object you want to store. You also need a distance function d that compares two objects and tells you how "far" or "close" they are from each other. The function d may need to satisfy the triangle inequality, although this is not a requirement for the GHS index.
In this example we will store vectors of 100 dimensions and compare them with the 1-norm (L1) distance.
The following code shows how to create an OB object.
package net.obsearch.example.vectors;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.ShortBuffer;
import java.util.Arrays;
import net.obsearch.asserts.OBAsserts;
import net.obsearch.constants.ByteConstants;
import net.obsearch.exception.OBException;
import net.obsearch.ob.OBFloat;
import net.obsearch.ob.OBInt;
import net.obsearch.ob.OBLong;
import net.obsearch.ob.OBShort;
import net.obsearch.utils.bytes.ByteConversion;
/**
 * A vector of floats that OBSearch can store and compare. Two vectors are
 * compared with the 1-norm (L1) distance: the sum of the absolute differences
 * of their components.
 */
public class L1Float implements OBFloat {

    /** Components of the vector; unset until a constructor or {@link #load(byte[])} fills it. */
    private float[] vector;

    /**
     * Creates an empty object whose contents are filled in later by
     * {@link #load(byte[])}.
     */
    public L1Float() {
        // required by OBSearch
    }

    /**
     * Construct an object from an array. The array is stored as given; it is
     * not copied.
     * @param vector
     *            The components of the vector.
     */
    public L1Float(float[] vector) {
        this.vector = vector;
    }

    /**
     * Parses a string of numbers. The string is split at every single space,
     * '|' or ',' character and each piece is parsed as a float.
     * @param data
     *            The textual representation of the vector.
     * @throws OBException
     *             Declared for callers; not thrown by this implementation.
     * @throws NumberFormatException
     *             If a piece is not a valid float, e.g. the empty piece
     *             produced by two adjacent separators.
     */
    public L1Float(String data) throws OBException {
        String[] split = data.split("[ |,]");
        vector = new float[split.length];
        for (int i = 0; i < split.length; i++) {
            vector[i] = Float.parseFloat(split[i]);
        }
    }

    /**
     * Computes the 1-norm distance between this vector and another one.
     * @param object
     *            The other vector; it must be an L1Float of the same length.
     * @return The sum of the absolute component-wise differences.
     * @throws OBException
     *             If one of the assertions below fails.
     */
    @Override
    public float distance(OBFloat object) throws OBException {
        L1Float other = (L1Float) object;
        OBAsserts.chkAssert(vector.length == other.vector.length, "Vector size mismatch");
        float res = 0;
        for (int i = 0; i < vector.length; i++) {
            res += Math.abs(vector[i] - other.vector[i]);
        }
        // NOTE(review): the bound is Long.MAX_VALUE although the result is a
        // float; kept as-is, confirm whether Float.MAX_VALUE was intended.
        OBAsserts.chkAssert(res <= Long.MAX_VALUE, "max value exceeded");
        return res;
    }

    /**
     * Replaces the contents of this object with the floats encoded in the
     * given bytes (the inverse of {@link #store()}).
     * @param input
     *            The serialized vector.
     */
    @Override
    public void load(byte[] input) throws OBException, IOException {
        FloatBuffer s = ByteConversion.createByteBuffer(input).asFloatBuffer();
        vector = new float[input.length / ByteConstants.Float.getSize()];
        s.get(vector);
    }

    /**
     * Equals method. Implementation of the equals method is required. Two
     * objects are equal when both are L1Float instances holding the same
     * components in the same order.
     * @param object
     *            The object to compare.
     * @return true if this and object are equal; false for null or for an
     *         object of another type.
     */
    @Override
    public final boolean equals(final Object object) {
        if (this == object) {
            return true;
        }
        // instanceof is false for null, so this also covers the null case.
        if (!(object instanceof L1Float)) {
            return false;
        }
        return Arrays.equals(vector, ((L1Float) object).vector);
    }

    /**
     * Hash code consistent with {@link #equals(Object)}: equal vectors yield
     * equal hash codes.
     * @return A hash of the components of the vector.
     */
    @Override
    public final int hashCode() {
        return Arrays.hashCode(vector);
    }

    /**
     * Serializes the vector into a byte array that {@link #load(byte[])} can
     * read back.
     * @return The bytes of all the components of the vector.
     */
    @Override
    public byte[] store() throws OBException, IOException {
        ByteBuffer b = ByteConversion.createByteBuffer(ByteConstants.Float.getSize() * vector.length);
        FloatBuffer s = b.asFloatBuffer();
        s.put(vector);
        return b.array();
    }
}
Now you can insert objects into an index and retrieve them.
package net.obsearch.example.vectors;
import hep.aida.bin.StaticBin1D;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import net.obsearch.ambient.Ambient;
import net.obsearch.ambient.bdb.AmbientBDBJe;
import net.obsearch.ambient.tc.AmbientTC;
import net.obsearch.exception.NotFrozenException;
import net.obsearch.exception.OBException;
import net.obsearch.exception.OBStorageException;
import net.obsearch.exception.PivotsUnavailableException;
import net.obsearch.index.ghs.impl.Sketch64Float;
import net.obsearch.index.ghs.impl.Sketch64Long;
import net.obsearch.index.utils.Directory;
import net.obsearch.pivots.AcceptAll;
import net.obsearch.pivots.bustos.impl.IncrementalBustosNavarroChavezShort;
import net.obsearch.pivots.rf02.RF02PivotSelectorShort;
import net.obsearch.pivots.rf03.RF03PivotSelectorLong;
import net.obsearch.pivots.rf03.RF03PivotSelectorShort;
import net.obsearch.pivots.rf04.RF04PivotSelectorFloat;
import net.obsearch.query.OBQueryFloat;
import net.obsearch.query.OBQueryLong;
import net.obsearch.result.OBPriorityQueueFloat;
import net.obsearch.result.OBPriorityQueueLong;
import net.obsearch.result.OBPriorityQueueShort;
import net.obsearch.result.OBResultShort;
/**
 * Demo that builds a Sketch64Float index over randomly generated L1Float
 * vectors, runs k=1 queries against it and then compares every answer with
 * the result of a sequential scan.
 */
public class VectorsDemoGHS extends VectorsDemo {

    /**
     * Runs the whole demo: index creation, loading, freezing, querying and
     * error validation.
     * @param args
     *            Command line arguments (not used).
     */
    public static void main(String[] args) throws FileNotFoundException, OBStorageException, NotFrozenException, IllegalAccessException, InstantiationException, OBException, IOException, PivotsUnavailableException {
        init();

        // Start from scratch: remove any index left in the folder.
        Directory.deleteDirectory(INDEX_FOLDER);

        // Pivot selection strategy.
        RF04PivotSelectorFloat<L1Float> pivotSelector = new RF04PivotSelectorFloat<L1Float>(new AcceptAll<L1Float>());
        pivotSelector.setDataSample(400);

        // The index itself. Pivot sizes that are multiples of 64 optimize
        // the space used.
        Sketch64Float<L1Float> index = new Sketch64Float<L1Float>(L1Float.class, pivotSelector, 256);
        // Expected error.
        index.setExpectedError(1.40);
        // Keep this small if you plan to insert a lot of objects.
        index.setSampleSize(100);
        // Probability of returning an error within 1.40 times the real
        // distance, measured in standard deviations (3 means a prob. of 0.99).
        index.setKAlpha(ALPHA);
        // The values of k that will be queried; this example only uses k=1.
        index.setMaxK(new int[]{1});
        // Small optimization for objects that all have the same size.
        index.setFixedRecord(true);
        index.setFixedRecord(VEC_SIZE*4);

        // The ambient stores the data of the index in INDEX_FOLDER.
        Ambient<L1Float, Sketch64Float<L1Float>> ambient = new AmbientTC<L1Float, Sketch64Float<L1Float>>(index, INDEX_FOLDER);

        // Load random objects into the index.
        logger.info("Adding " + DB_SIZE + " objects...");
        for (int inserted = 0; inserted < DB_SIZE; inserted++) {
            index.insert(generateFloatVector());
            if (inserted % 100000 == 0) {
                logger.info("Loading: " + inserted);
            }
        }

        // Freeze the index so that it can be queried.
        logger.info("Preparing the index...");
        ambient.freeze();
        logger.info("YAY! stats: " + index.getStats());

        // Query phase.
        logger.info("Querying the index...");
        index.resetStats(); // start counting from zero
        long start = System.currentTimeMillis();
        List<OBPriorityQueueFloat<L1Float>> queryResults = new ArrayList<OBPriorityQueueFloat<L1Float>>(QUERY_SIZE);
        List<L1Float> queries = new ArrayList<L1Float>(QUERY_SIZE);
        for (int n = 0; n < QUERY_SIZE; n++) {
            L1Float query = generateFloatVector();
            // k = 1 together with a very large range.
            OBPriorityQueueFloat<L1Float> nearest = new OBPriorityQueueFloat<L1Float>(1);
            index.searchOB(query, Float.MAX_VALUE, nearest);
            queryResults.add(nearest);
            queries.add(query);
        }

        // Report timing and statistics of the query phase.
        long elapsed = System.currentTimeMillis() - start;
        logger.info("Time per query: " + elapsed / QUERY_SIZE + " millisec.");
        logger.info("Stats follow: (total distances / pivot vectors computed during the experiment)");
        logger.info(index.getStats().toString());

        // Validation: compare each answer against a sequential scan.
        logger.info("Doing Error validation");
        StaticBin1D ep = new StaticBin1D();
        StaticBin1D seqTime = new StaticBin1D();
        for (int pos = 0; pos < queryResults.size(); pos++) {
            OBPriorityQueueFloat<L1Float> found = queryResults.get(pos);
            L1Float query = queries.get(pos);
            long scanStart = System.currentTimeMillis();
            float[] sortedList = index.fullMatchLite(query, false);
            long scanTime = System.currentTimeMillis() - scanStart;
            seqTime.add(scanTime);
            logger.info("Elapsed: " + scanTime + " " + pos);
            OBQueryFloat<L1Float> queryObj = new OBQueryFloat<L1Float>(query, Float.MAX_VALUE, found, null);
            ep.add(queryObj.approx(sortedList));
        }
        logger.info(ep.toString());
        logger.info("Time per seq query: ");
        logger.info(seqTime.toString());
    }
}
To run the previous demo, simply execute:
java -classpath obsearch-with-dependencies.jar net.obsearch.example.vectors.VectorsDemoGHS
