View Javadoc

1   package net.obsearch.index.ghs;
2   
3   import it.unimi.dsi.io.InputBitStream;
4   import it.unimi.dsi.io.OutputBitStream;
5   
6   import java.io.File;
7   import java.io.FileInputStream;
8   import java.io.FileOutputStream;
9   import java.io.IOException;
10  import java.util.ArrayList;
11  import java.util.Arrays;
12  import java.util.Collections;
13  import java.util.List;
14  
15  import net.obsearch.asserts.OBAsserts;
16  import net.obsearch.exception.OBException;
17  import net.obsearch.result.OBResultInvertedByte;
18  
19  /*
20   OBSearch: a distributed similarity search engine This project is to
21   similarity search what 'bit-torrent' is to downloads. 
22   Copyright (C) 2009 Arnoldo Jose Muller Molina
23  
24   This program is free software: you can redistribute it and/or modify
25   it under the terms of the GNU General Public License as published by
26   the Free Software Foundation, either version 3 of the License, or
27   (at your option) any later version.
28  
29   This program is distributed in the hope that it will be useful,
30   but WITHOUT ANY WARRANTY; without even the implied warranty of
31   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
32   GNU General Public License for more details.
33  
34   You should have received a copy of the GNU General Public License
35   along with this program.  If not, see <http://www.gnu.org/licenses/>.
36   */
37  
38  /**
39   * CompressedBitSet64 stores bits in a byte array. The bit set must be first
40   * created (stored in a temporary file) And then the bytes will be loaded into
41   * memory. The compressed bit set works on longs and it allows sequential k-nn
42   * searches of longs with the hamming distance. Insertions must be done in
43   * ascending order. The main assumption is that compression will allow small
44   * indexes that will be stored in memory.
45   * 
46   * @author Arnoldo Jose Muller Molina
47   */
48  
49  public class CompressedBitSet64 {
50  
51  	protected byte[] data;
52  	private File f;
53  	private OutputBitStream out;
54  	protected int count = 0;
55  	protected long first;
56  	private long previous;
57  	private boolean commit;
58  
59  	/**
60  	 * Create a new compressed bit set.
61  	 * 
62  	 * @throws IOException
63  	 */
64  	public CompressedBitSet64() throws OBException {
65  		try {
66  			f = File.createTempFile("OBSearch", "bitset");
67  			FileOutputStream o = new FileOutputStream(f);
68  			out = new OutputBitStream(o);
69  		} catch (IOException e) {
70  			throw new OBException(e);
71  		}
72  		commit = false;
73  	}
74  
75  	/**
76  	 * Add the ith bit to this bitset.
77  	 * 
78  	 * @param bit
79  	 * @throws OBException
80  	 */
81  	public void add(long bit) throws OBException {
82  		OBAsserts.chkAssert(!commit, "Cannot add when commited");
83  		if (count == 0) {
84  			first = bit;
85  		} else {
86  			OBAsserts.chkAssert(previous < bit,
87  					"Elements must be ordered from lower to higher values");
88  			long d = bit - previous;
89  			try {
90  				write(d, out);
91  			} catch (IOException e) {
92  				throw new OBException(e);
93  			}
94  		}
95  		previous = bit;
96  		count++;
97  	}
98  
99  	/**
100 	 * We will stop adding values and now we will use the bit set for search.
101 	 * 
102 	 * @throws OBException
103 	 */
104 	public void commit() throws OBException {
105 		OBAsserts.chkAssert(!commit, "Cannot commit when commited");
106 		try {
107 			out.close();
108 			long byteSize = f.length();
109 			OBAsserts.chkAssert(byteSize <= Integer.MAX_VALUE,
110 					"Exceeded allowed index size");
111 			data = new byte[(int) byteSize];
112 			FileInputStream fi = new FileInputStream(f);
113 			fi.read(data);
114 			fi.close();
115 			f.delete();
116 		} catch (IOException e) {
117 			throw new OBException(e);
118 		}
119 	}
120 
121 	public long getBytesSize() {
122 		return data.length;
123 	}
124 
125 	/**
126 	 * Hamming distance used for searching.
127 	 * 
128 	 * @param a
129 	 * @param b
130 	 * @return
131 	 */
132 	public final int bucketDistance(long a, long b) {
133 		return Long.bitCount(a ^ b);
134 	}
135 
136 	/**
137 	 * Search the maxF closest buckets by hamming distance for the given query
138 	 * 
139 	 * @param query
140 	 *            The query we will use
141 	 * @param maxF
142 	 *            The number of objects that will be searched.
143 	 * @param m
144 	 *            the number of bits == max distance expected.
145 	 * @return The closest objects to the given query.
146 	 * @throws InstantiationException
147 	 * @throws IllegalAccessException
148 	 * @throws OBException
149 	 */
150 	public long[] searchBuckets(long query, int maxF, int m)
151 			throws OBException {
152 
153 		InputBitStream delta = new InputBitStream(data);
154 		long[] result = new long[maxF];
155 		FastPriorityQueueLong l = new FastPriorityQueueLong(m, maxF);
156 
157 		// do the first element.
158 		int distance = bucketDistance(query, first);
159 		// previous bucket id.
160 		long prev = first;
161 		l.add(first, distance);
162 		// do the rest
163 		try {
164 			int i = 1;
165 			while (i < count) {
166 				long object = read(delta) + prev;
167 				distance = bucketDistance(query, object);
168 				l.add(object, distance);
169 				prev = object;
170 				i++;
171 			}
172 		} catch (IOException e) {
173 			throw new OBException(e);
174 		}
175 		return l.get();
176 	}
177 	
178 	/**
179 	 * Search the entire bitset. This should be done with datasets
180 	 * that can fit in memory and only to generate statistics
181 	 * on the data.
182 	 * @param query query to search
183 	 * @return
184 	 */
185 	public List<OBResultInvertedByte<Long>> searchFull(long query) throws OBException {
186 		List<OBResultInvertedByte<Long>> result = new ArrayList<OBResultInvertedByte<Long>>(size());
187 		InputBitStream delta = new InputBitStream(data);
188 		// do the first element.
189 		int distance = bucketDistance(query, first);
190 		result.add(new OBResultInvertedByte<Long>(first, first, (byte)distance));
191 		long prev = first;
192 		int i = 1;
193 		try {
194 		while (i < count) {
195 			long object;
196 			
197 				object = read(delta) + prev;
198 			
199 			distance = bucketDistance(query, object);
200 			assert distance <= Byte.MAX_VALUE;
201 			result.add(new OBResultInvertedByte<Long>(object, object, (byte)distance));
202 			prev = object;
203 			i++;
204 		}
205 		Collections.sort(result);
206 		return result;
207 		} catch (IOException e) {
208 			throw new OBException(e);
209 		}
210 	}
211 	
212 	public int size(){
213 		return count;
214 	}
215 	
216 	/**
217 	 * Return all the buckets just for debugging purposes.
218 	 * @return
219 	 * @throws IOException 
220 	 */
221 	protected long[] getAll() throws IOException{
222 		long[] result = new long[count];
223 		long prev = first;
224 		int i = 0;
225 		result[i] = prev;
226 		i++;
227 		InputBitStream delta = new InputBitStream(data);
228 		while(i < result.length){
229 			result[i] = read(delta) + prev;
230 			prev = result[i];
231 			i++;
232 		}
233 		return result;
234 	}
235 	
236 	private void write(long object, OutputBitStream out) throws IOException{
237 		out.writeLongDelta(object);
238 	}
239 	
240 	protected long read(InputBitStream in) throws IOException{
241 		return in.readLongDelta();
242 	}
243 
244 }