View Javadoc

1   package net.obsearch.index.utils;
2   
3   import hep.aida.bin.StaticBin1D;
4   
5   import java.io.BufferedReader;
6   import java.io.File;
7   import java.io.FileReader;
8   import java.io.IOException;
9   import java.util.ArrayList;
10  import java.util.Collections;
11  import java.util.HashSet;
12  import java.util.Iterator;
13  import java.util.LinkedList;
14  import java.util.List;
15  import java.util.Set;
16  
17  import net.obsearch.AbstractOBResult;
18  import net.obsearch.ApproxIndexShort;
19  import net.obsearch.asserts.OBAsserts;
20  import net.obsearch.exception.OBException;
21  import net.obsearch.exception.OBStorageException;
22  import net.obsearch.index.IndexShort;
23  import net.obsearch.ob.OBShort;
24  import net.obsearch.query.OBQueryShort;
25  import net.obsearch.result.OBPriorityQueueShort;
26  import net.obsearch.result.OBResultShort;
27  import net.obsearch.stats.Statistics;
28  
29  import org.apache.log4j.Logger;
30  import static org.junit.Assert.assertEquals;
31  import static org.junit.Assert.assertFalse;
32  import static org.junit.Assert.assertTrue;
33  
34  public class IndexSmokeTUtilApprox<O extends OBShort> extends IndexSmokeTUtil<O> {
35  	
36  	public IndexSmokeTUtilApprox(OBFactory<O> factory) throws IOException {
37  		super(factory);
38  	}
39  	
40  	/**
41       * Logger.
42       */
43      private static transient final Logger logger = Logger
44              .getLogger(IndexSmokeTUtilApprox.class);
45  
46  	public void search(IndexShort<O> index, short range, short k)
47  			throws Exception {
48  		// assertEquals(index.aDB.count(), index.bDB.count());
49  		// assertEquals(index.aDB.count(), index.bDB.count());
50  		// index.stats();
51  		OBAsserts.chkAssert(index.databaseSize() <= Integer.MAX_VALUE, "Cannot test with more than 2^32 items");
52  		index.resetStats();
53  		// it is time to Search
54  		int querySize = 1000; // amount of elements to read from the query
55  		String re = null;
56  		logger.info("Matching begins...");
57  		File query = new File(testProperties.getProperty("test.query.input"));
58  		File dbFolder = new File(testProperties.getProperty("test.db.path"));
59  		BufferedReader r = new BufferedReader(new FileReader(query));
60  		List<OBPriorityQueueShort<O>> result = new LinkedList<OBPriorityQueueShort<O>>();
61  		re = r.readLine();
62  		int i = 0;
63  		long realIndex = index.databaseSize();
64  
65  		while (re != null) {
66  			String line = parseLine(re);
67  			if (line != null) {
68  				OBPriorityQueueShort<O> x = new OBPriorityQueueShort<O>(k);
69  				if (i % 100 == 0) {
70  					logger.info("Matching " + i);
71  				}
72  
73  				O s = factory.create(line);
74  				if (factory.shouldProcess(s)) {					
75  					index.searchOB(s, range, x);
76  					result.add(x);
77  					i++;
78  				}
79  			}
80  			if (i == querySize) {
81  				logger.warn("Finishing test at i : " + i);
82  				break;
83  			}
84  			re = r.readLine();
85  		}
86  
87  		logger.info(index.getStats().toString());
88  
89  		int maxQuery = i;
90  		// logger.info("Matching ends... Stats follow:");
91  		// index.stats();
92  
93  		// now we compare the results we got with the sequential search
94  		Iterator<OBPriorityQueueShort<O>> it = result.iterator();
95  		r.close();
96  		r = new BufferedReader(new FileReader(query));
97  		re = r.readLine();
98  		i = 0;
99  		StaticBin1D stats = new StaticBin1D();
100 		int emptyResults = 0;
101 		int badResults = 0;
102 		int zeroResults = 0;
103 		IntegerHolder badCount = new IntegerHolder(0);
104 		Statistics st = new Statistics();
105 		StaticBin1D recall = new StaticBin1D();
106 		while (re != null) {
107 			String line = parseLine(re);
108 			if (line != null) {
109 				if (i % 300 == 0) {
110 					logger.info("Matching " + i + " of " + maxQuery);
111 				}
112 				O s = factory.create(line);
113 				if (factory.shouldProcess(s)) {
114 					
115 					ArrayList<OBResultShort<O>> x2 = new ArrayList<OBResultShort<O>>((int)index.databaseSize());
116 
117 					searchSequential(realIndex, s, x2, index, range);
118 					OBPriorityQueueShort<O> x1 = it.next();
119 					// assertEquals("Error in query line: " + i + " slice: "
120 					// + line, x2, x1);
121 					st.addExtraStats("ReturnedSize", x2.size());
122 					if(isApproxZero(x1,x2,range)){
123 						//logger.info("Error in query line: " + i + " " + index.debug(s) + "\n slice: "
124 	                    //        + line + " " + debug(x2.getSortedElements().subList(0, Math.min(3, x2.getSize())).iterator(),index ) + "\n");
125 						emptyResults++;
126 					}else{
127 						double ep = ep(x1,x2,index);
128 						if(ok(x1,x2,range)){
129 							assertTrue(ep == 0);
130 						}
131 						stats.add(ep);
132 						if(index instanceof ApproxIndexShort){
133 							OBQueryShort<O> q = new OBQueryShort<O>(null, x1);
134 							List<AbstractOBResult<O>> seq = new LinkedList<AbstractOBResult<O>>();
135 							for(OBResultShort<O> o : x2){
136 								seq.add(o);
137 							}
138 							double anotherEp = q.ep(seq);
139 							assertTrue("index ep " + ep + " priority ep: " + anotherEp, ep == anotherEp);
140 							
141 						}
142 					}
143 					if(!ok(x1,x2, range)){
144 						badResults++;
145 					}
146 					
147 					if(zeroResults(x2,range)){
148 						zeroResults++;
149 					}
150 					recall.add(recall(x1,x2,k,range));
151 					i++;
152 				}
153 
154 			}
155 			if (i == querySize) {
156 				logger.warn("Finishing test at i : " + i);
157 				break;
158 			}
159 			
160 			re = r.readLine();
161 		}
162 		
163 		r.close();
164 		logger.info("Finished  CompoundError calculation: ");
165 		logger.info("Returned size" + st);
166 		logger.info(StatsUtil.prettyPrintStats("CompoundError", stats));
167 		assertFalse(it.hasNext());
168 		logger.info("Recall: " + recall.toString());
169 		logger.info("Zero queries: " + emptyResults);
170 		logger.info("Bad results: " + badResults);
171 		logger.info("Zero (real) queries : " + zeroResults);
172 	}
173 	
174 	/**
175 	 * 
176 	 * @param db database
177 	 * @param range range query
178 	 * @return true if the results are zero.
179 	 */
180 	public boolean zeroResults( List<OBResultShort<O>> db, short range){
181 		return db.get(0).getDistance() > range;
182 	}
183 	
184 	/**
185 	 * If x1 (query) returned zero results but db had results, this function returns
186 	 * true. Otherwise false.
187 	 * @param x1
188 	 * @param db
189 	 * @param range
190 	 * @return
191 	 */
192 	public boolean isApproxZero(OBPriorityQueueShort<O> x1, List<OBResultShort<O>> db, short range){
193 		return x1.getSize() == 0 && (db.size() != 0 && db.get(0).getDistance() <= range);
194 	}
195 	
196 	public boolean ok(OBPriorityQueueShort<O> x1, List<OBResultShort<O>> db, short range){
197 		
198 		// if the db is empty, then we don't have to worry about anything else.
199 		if(zeroResults(db,range)){
200 			assert x1.getSize() == 0;
201 			return true;
202 		}
203 		
204 		List<OBResultShort<O>> query = x1.getSortedElements();
205 		
206 		if(query.size() == 0 && db.size() != 0){
207 			return false;
208 		}
209 		assert query.size() <= db.size() : "Query: " + query + " db: " + db;
210 		Iterator<OBResultShort<O>> it = db.iterator();
211 		for(OBResultShort<O> r : query){
212 			assert it.hasNext();
213 			if(it.hasNext() && r.compareTo(it.next()) != 0){
214 				return false;
215 			}
216 		}
217 		return true;
218 	}
219 	
220 	public double recall(OBPriorityQueueShort<O> x1, List<OBResultShort<O>> db, int k, short range){
221 		if(zeroResults(db,range)){
222 			assert x1.getSize() == 0;
223 			return 1;
224 		}
225 		int hits = 0;
226 		Set<OBResultShort<O>> s = new HashSet<OBResultShort<O>>();
227 		for(OBResultShort<O> r : x1.getSortedElements()){			
228 			int i = 0;
229 			for(OBResultShort<O> d : db){
230 				if(i == k){
231 					break;
232 				}
233 				if(! s.contains(d) &&d.compareTo(r) == 0){
234 					s.add(d);
235 					hits++;
236 					break;
237 				}
238 				i++;
239 			}
240 		}
241 		return (double)hits / (double)k;
242 	}
243 	
244 	/**
245      * Sequential search.
246      * @param max
247      *                Search all the ids in the database until max
248      * @param o
249      *                The object to search
250      * @param result
251      *                The queue were the results are stored
252      * @param index
253      *                the index to search
254      * @param range
255      *                The range to employ
256 	 * @throws InstantiationException 
257 	 * @throws IllegalAccessException 
258      * @throws Exception
259      *                 If something goes really bad.
260      */
261     public  void searchSequential(long max, O o,
262             ArrayList < OBResultShort<O> > result,
263             IndexShort < O > index, short range) throws OBException, IllegalAccessException, InstantiationException {
264         int i = 0;
265         while (i < max) {
266             O obj = index.getObject(i);
267             short res = o.distance(obj);
268             //if (res <= range) {
269                 result.add( new OBResultShort<O>(obj, i, res));
270             //}
271             i++;
272         }
273         Collections.sort(result);
274         Collections.reverse(result);
275     }
276 	
277 	public double ep(OBPriorityQueueShort<O> x1, List<OBResultShort<O>> db, IndexShort < O > index) throws OBStorageException{
278 		List<OBResultShort<O>> query = x1.getSortedElements();
279 		int i = 0;
280 		int result = 0;
281 		Set<OBResultShort<O>> s = new HashSet<OBResultShort<O>>();
282 		for(OBResultShort<O> r : query){
283 			// find the position in the db. 
284 			int cx = 0;
285 			for(OBResultShort<O> c : db){
286 				if(! s.contains(c) &&c.compareTo(r) == 0){
287 					s.add(c);
288 					result += cx - i;
289 					break;
290 				}
291 				cx++;
292 			}
293 			i++;
294 		}
295 		if(query.size() == 0){
296 			return 0;
297 		}else{
298 			double res = ((double)result)/ ((double)(query.size() * db.size()));
299 			return res;
300 		}
301 	}
302 
303 }