This quick tutorial will show how to use OBSearch.
First, decide what kind of object you want to store. You will also need a distance function d, which compares two objects and tells you how "far" or "close" they are from each other. In general d should satisfy the triangle inequality, although this is not a requirement for the GHS index.
In this example we will store vectors of 100 dimensions and compare them using the 1-norm (L1) distance.
The following code shows how to create an OB object.
package net.obsearch.example.vectors; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.FloatBuffer; import java.nio.IntBuffer; import java.nio.ShortBuffer; import java.util.Arrays; import net.obsearch.asserts.OBAsserts; import net.obsearch.constants.ByteConstants; import net.obsearch.exception.OBException; import net.obsearch.ob.OBFloat; import net.obsearch.ob.OBInt; import net.obsearch.ob.OBLong; import net.obsearch.ob.OBShort; import net.obsearch.utils.bytes.ByteConversion; public class L1Float implements OBFloat { private float[] vector; public L1Float(){ // required by OBSearch } /** * Construct an object from an array. * @param vector */ public L1Float(float[] vector){ this.vector = vector; } /** * Parses a string with numbers separated by spaces * @param data */ public L1Float(String data)throws OBException{ String[] split = data.split("[ |,]"); vector = new float[split.length]; //OBAsserts.chkAssert(vector.length == 64, "Size wrong for vector: " + vector.length); int i = 0; for(String s : split){ vector[i] = Float.parseFloat(s); i++; } } @Override public float distance(OBFloat object) throws OBException { L1Float other = (L1Float)object; int i = 0; float res = 0; OBAsserts.chkAssert(vector.length == other.vector.length, "Vector size mismatch"); while(i < vector.length){ res += Math.abs(vector[i] - other.vector[i]); i++; } OBAsserts.chkAssert(res <= Long.MAX_VALUE, "max value exceeded"); return res; } @Override public void load(byte[] input) throws OBException, IOException { FloatBuffer s = ByteConversion.createByteBuffer(input).asFloatBuffer(); vector = new float[input.length / ByteConstants.Float.getSize()]; s.get(vector); } /** * 6) Equals method. Implementation of the equals method is required. A * casting error can happen here, but we don't check it for efficiency * reasons. 
* @param object * The object to compare. * @return true if this and object are equal. */ public final boolean equals(final Object object) { L1Float o = (L1Float) object; return Arrays.equals(vector, o.vector); } @Override public byte[] store() throws OBException, IOException { ByteBuffer b = ByteConversion.createByteBuffer(ByteConstants.Float.getSize() * vector.length); FloatBuffer s = b.asFloatBuffer(); s.put(vector); return b.array(); } }
Now you can insert objects in an index and retrieve them.
package net.obsearch.example.vectors; import hep.aida.bin.StaticBin1D; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import net.obsearch.ambient.Ambient; import net.obsearch.ambient.bdb.AmbientBDBJe; import net.obsearch.ambient.tc.AmbientTC; import net.obsearch.exception.NotFrozenException; import net.obsearch.exception.OBException; import net.obsearch.exception.OBStorageException; import net.obsearch.exception.PivotsUnavailableException; import net.obsearch.index.ghs.impl.Sketch64Float; import net.obsearch.index.ghs.impl.Sketch64Long; import net.obsearch.index.utils.Directory; import net.obsearch.pivots.AcceptAll; import net.obsearch.pivots.bustos.impl.IncrementalBustosNavarroChavezShort; import net.obsearch.pivots.rf02.RF02PivotSelectorShort; import net.obsearch.pivots.rf03.RF03PivotSelectorLong; import net.obsearch.pivots.rf03.RF03PivotSelectorShort; import net.obsearch.pivots.rf04.RF04PivotSelectorFloat; import net.obsearch.query.OBQueryFloat; import net.obsearch.query.OBQueryLong; import net.obsearch.result.OBPriorityQueueFloat; import net.obsearch.result.OBPriorityQueueLong; import net.obsearch.result.OBPriorityQueueShort; import net.obsearch.result.OBResultShort; public class VectorsDemoGHS extends VectorsDemo { public static void main(String args[]) throws FileNotFoundException, OBStorageException, NotFrozenException, IllegalAccessException, InstantiationException, OBException, IOException, PivotsUnavailableException { init(); // Delete the directory of the index just in case. Directory.deleteDirectory(INDEX_FOLDER); // Create the pivot selection strategy RF04PivotSelectorFloat<L1Float> sel = new RF04PivotSelectorFloat<L1Float>(new AcceptAll<L1Float>()); sel.setDataSample(400); // make the bit set as short so that m objects can fit in the buckets. // create an index. 
// Choose pivot sizes that are multiples of 64 to optimize the space Sketch64Float<L1Float> index = new Sketch64Float<L1Float>(L1Float.class, sel, 256); // error expected index.setExpectedError(1.40); // small if you are planning to insert a lot of objects! index.setSampleSize(100); // Probability of returning an error within 1.40 times the real distance // (measured in standard deviations) (3 means a prob. of 0.99) index.setKAlpha(ALPHA); // select the ks that the user will call. // This example will only be called with k=1 index.setMaxK(new int[]{1}); // little optimization that can help if your objects are of the same size. index.setFixedRecord(true); index.setFixedRecord(VEC_SIZE*4); // Create the ambient that will store the index's data. (NOTE: folder name is hardcoded) Ambient<L1Float, Sketch64Float<L1Float>> a = new AmbientTC<L1Float, Sketch64Float<L1Float>>( index, INDEX_FOLDER ); // Add some random objects to the index: logger.info("Adding " + DB_SIZE + " objects..."); int i = 0; while(i < DB_SIZE){ index.insert(generateFloatVector()); if(i % 100000 == 0){ logger.info("Loading: " + i); } i++; } // prepare the index logger.info("Preparing the index..."); a.freeze(); logger.info("YAY! stats: " + index.getStats()); // now we can match some objects! logger.info("Querying the index..."); i = 0; index.resetStats(); // reset the stats counter long start = System.currentTimeMillis(); List<OBPriorityQueueFloat<L1Float>> queryResults = new ArrayList<OBPriorityQueueFloat<L1Float>>(QUERY_SIZE); List<L1Float> queries = new ArrayList<L1Float>(QUERY_SIZE); while(i < QUERY_SIZE){ L1Float q = generateFloatVector(); // query the index with k=1 OBPriorityQueueFloat<L1Float> queue = new OBPriorityQueueFloat<L1Float>(1); // perform a query with a large range and k = 1 index.searchOB(q, Float.MAX_VALUE, queue); queryResults.add(queue); queries.add(q); i++; } // print the results of the set of queries. 
long elapsed = System.currentTimeMillis() - start; logger.info("Time per query: " + elapsed / QUERY_SIZE + " millisec."); logger.info("Stats follow: (total distances / pivot vectors computed during the experiment)"); logger.info(index.getStats().toString()); // now we validate the result of the search logger.info("Doing Error validation"); StaticBin1D ep = new StaticBin1D(); Iterator<OBPriorityQueueFloat<L1Float>> it1 = queryResults.iterator(); Iterator<L1Float> it2 = queries.iterator(); StaticBin1D seqTime = new StaticBin1D(); i = 0; while(it1.hasNext()){ OBPriorityQueueFloat<L1Float> qu = it1.next(); L1Float q = it2.next(); long time = System.currentTimeMillis(); float[] sortedList = index.fullMatchLite(q, false); long el = System.currentTimeMillis() - time; seqTime.add(el); logger.info("Elapsed: " + el + " " + i); OBQueryFloat<L1Float> queryObj = new OBQueryFloat<L1Float >(q, Float.MAX_VALUE, qu, null); ep.add(queryObj.approx(sortedList)); i++; } logger.info(ep.toString()); logger.info("Time per seq query: "); logger.info(seqTime.toString()); } }
To run the previous demo simply do:
java -classpath obsearch-with-dependencies.jar net.obsearch.example.vectors.VectorsDemoGHS