-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfile_cache.go
More file actions
359 lines (331 loc) · 12.4 KB
/
file_cache.go
File metadata and controls
359 lines (331 loc) · 12.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
package main
import (
"fmt"
"io"
"unsafe"
)
// A FileCache implements efficient reading and storage of multiple possibly
// overlapping sections of a file.
//
// The desired sections of the file are first registered with RegisterRange(),
// then the data is read into memory in batches using any appropriate
// combination of ReadData() and StoreData().
//
// The stored data can then be retrieved via the RangeGetter interface.
//
// A FileCache is not safe for concurrent use (see RGLock/RGRelease).
type FileCache struct {
	// A BinaryTree of the hunks currently in the cache, sorted by their
	// offsets. RegisterRange() merges overlapping/adjacent registrations,
	// so the hunks are always disjoint.
	hunks BinaryTree[fileHunk]
	// The combined amount of data (in bytes) registered with all of the
	// cache's hunks — i.e. the sum of (endOffset - startOffset) over every
	// hunk, whether or not its content has been populated yet.
	currentSize uint64
}
// fileHunkMemoryOverhead is the memory footprint (in bytes) of a single file
// hunk, not including that of the data itself: the size of the fileHunk
// struct plus the per-node overhead of the containing BinaryTree.
const fileHunkMemoryOverhead = uint64(unsafe.Sizeof(fileHunk{})) + BinaryTreeNodeOverhead
// FullMemoryFootprint returns the approximate total memory footprint, in
// bytes, the FileCache will have once all of the registered data has been
// populated.
func (fc *FileCache) FullMemoryFootprint() uint64 {
	// Per-hunk bookkeeping overhead on top of the registered data itself.
	perHunkOverhead := uint64(fc.hunks.Size) * fileHunkMemoryOverhead
	return fc.currentSize + perHunkOverhead
}
// Registers the range [startOffset, endOffset) as one that should be read
// into memory.
//
// The data it contains will not actually be populated until either ReadData()
// or StoreData() is called appropriately.
//
// Any existing hunks that the range overlaps (or exactly abuts) are merged
// with it into a single hunk, keeping fc.hunks disjoint and sorted.
func (fc *FileCache) RegisterRange(startOffset, endOffset uint64) {
	// Find the last hunk already registered which starts at or before
	// startOffset, and the first hunk already registered which ends
	// strictly after endOffset.
	precHunkIndex, _ := fc.hunks.Search(
		func(h fileHunk) bool { return h.startOffset > startOffset },
		false,
	)
	// Search returned the first hunk starting AFTER startOffset; step back
	// to its predecessor (-1 when no hunk starts at or before startOffset).
	precHunkIndex--
	var precHunk *fileHunk
	if precHunkIndex != -1 {
		precHunk = fc.hunks.Get(precHunkIndex)
	}
	follHunkIndex, follHunk := fc.hunks.Search(
		func(h fileHunk) bool { return h.endOffset > endOffset },
		true,
	)
	// Determine any overlap/adjacency therewith. The non-strict comparisons
	// (>= / <=) deliberately treat an exactly adjacent hunk as overlapping,
	// so touching ranges are merged too.
	var precHunkOverlaps, follHunkOverlaps bool
	if precHunkIndex != -1 {
		precHunkOverlaps = (precHunk.endOffset >= startOffset)
	}
	if follHunkIndex != fc.hunks.Size {
		follHunkOverlaps = (follHunk.startOffset <= endOffset)
	}
	if precHunkIndex == follHunkIndex {
		// The requested range is already entirely contained within an existing
		// hunk, so do nothing
	} else if (follHunkIndex > precHunkIndex+1) || precHunkOverlaps || follHunkOverlaps {
		// The requested range overlaps with one or more existing hunks,
		// so replace them all with a single new one describing the combined
		// range

		// Determine the range of indices in fc.hunks to replace (as a normal
		// half-open interval), and adjust startOffset and endOffset to the
		// full extent of the new hunk.
		firstOverlapHunkIndex, lastOverlapHunkIndex := precHunkIndex+1, follHunkIndex
		if precHunkOverlaps {
			firstOverlapHunkIndex--
			startOffset = precHunk.startOffset
		}
		if follHunkOverlaps {
			lastOverlapHunkIndex++
			endOffset = follHunk.endOffset
		}
		firstOverlapHunk := fc.hunks.Get(firstOverlapHunkIndex)
		// Replace the hunk object at firstOverlapHunkIndex with the new one,
		// keeping currentSize in step with the change in registered bytes.
		fc.currentSize -= firstOverlapHunk.endOffset - firstOverlapHunk.startOffset
		fc.currentSize += endOffset - startOffset
		firstOverlapHunk.startOffset = startOffset
		firstOverlapHunk.endOffset = endOffset
		if !precHunkOverlaps {
			// The start has changed, so any content is now invalid (a hunk's
			// content always begins exactly at its startOffset)
			firstOverlapHunk.content = nil
		}
		// Remove any other hunks which overlap, subtracting each one's
		// registered size before it is deleted.
		if lastOverlapHunkIndex > firstOverlapHunkIndex+1 {
			for i := firstOverlapHunkIndex + 1; i < lastOverlapHunkIndex; i++ {
				hunk := fc.hunks.Get(i)
				fc.currentSize -= (hunk.endOffset - hunk.startOffset)
			}
			fc.hunks.Delete(firstOverlapHunkIndex+1, lastOverlapHunkIndex)
		}
	} else { // i.e. if (follHunkIndex == precHunkIndex+1) && !precHunkOverlaps && !follHunkOverlaps
		// The requested range overlaps with none of the existing hunks, so
		// add it as a new one.
		newHunk := fileHunk{
			startOffset: startOffset,
			endOffset:   endOffset,
		}
		fc.hunks.Insert(follHunkIndex, newHunk)
		fc.currentSize += endOffset - startOffset
	}
}
// Reads as necessary from the file to populate the cache.
//
// The file will only be read up to the specified offset stopAt (non-inclusive),
// and any parts of the cache referring to the rest of the file will remain
// unpopulated.
//
// Returns a non-nil error (wrapping the underlying error) if a call to
// file.ReadAt() fails.
func (fc *FileCache) ReadData(file io.ReaderAt, stopAt uint64) error {
	// Since hunks will very often be short and closely spaced (and disks
	// are generally read cluster-by-cluster) reading each individually will
	// frequently prove horribly inefficient. Instead, we use a buffer to
	// consolidate the reads for closely spaced hunks as far as possible.
	const bufCap = 65536
	buf := make([]byte, 0, bufCap)
	bufStartOff := uint64(0) // The offset corresponding to buf[0]
	// Populate each hunk in turn.
	for hunkIndex := 0; hunkIndex < fc.hunks.Size; hunkIndex++ {
		hunk := fc.hunks.Get(hunkIndex)
		// Stop once we reach stopAt (hunks are sorted by offset, so every
		// later hunk starts at or after this one).
		if hunk.startOffset >= stopAt {
			break
		}
		// Skip hunks that have already been fully populated.
		hunkLength := uint64(len(hunk.content))
		if hunkLength >= hunk.endOffset-hunk.startOffset {
			continue
		}
		// The file offset at which reading of this hunk would resume.
		readStart := hunk.startOffset + hunkLength
		// If a previous call (with a larger stopAt) already populated this
		// hunk up to or past stopAt, there is nothing more to read in this
		// call. Without this guard, the readLength computations below would
		// underflow (uint64) and the subsequent reslice would panic. Later
		// hunks start at or after hunk.endOffset >= readStart >= stopAt, so
		// we can stop entirely.
		if readStart >= stopAt {
			break
		}
		// Pre-allocate the content slice if it hasn't already been
		if hunk.content == nil {
			hunk.content = make([]byte, 0, hunk.endOffset-hunk.startOffset)
		}
		// First check if the hunk content has already been read into buf.
		if (bufStartOff <= hunk.startOffset) && (bufStartOff+uint64(len(buf)) >= hunk.endOffset) {
			// If it has, just copy it from there.
			hunk.content = hunk.content[:(hunk.endOffset - hunk.startOffset)]
			copy(hunk.content, buf[(hunk.startOffset-bufStartOff):])
		} else {
			// If not, check if the next hunk(s) is/are close enough to this
			// one to warrant consolidating into one read operation.
			maxReadEnd := Min(readStart+bufCap, stopAt)
			i := hunkIndex + 1
			for (i < fc.hunks.Size) && (fc.hunks.Get(i).endOffset <= maxReadEnd) {
				i++
			}
			consolidatedHunksEndIndex := i - 1
			if consolidatedHunksEndIndex > hunkIndex {
				// It is worth consolidating multiple hunks into one read
				// command, so read the appropriate range of bytes into the
				// buffer ...
				bufStartOff = readStart
				readLength := fc.hunks.Get(consolidatedHunksEndIndex).endOffset - readStart
				buf = buf[:readLength]
				if _, err := file.ReadAt(buf, int64(readStart)); err != nil {
					return fmt.Errorf(
						"error while populating cache: failed to read %d "+
							"bytes from offset %d: %w",
						readLength, readStart, err,
					)
				}
				// ... then copy from the buffer into the current hunk.
				hunk.content = hunk.content[:(hunk.endOffset - hunk.startOffset)]
				copy(hunk.content[hunkLength:], buf)
			} else {
				// It isn't worth consolidating multiple hunks into one read,
				// so just read straight from the file into this hunk.
				readLength := Min(hunk.endOffset, stopAt) - readStart
				hunk.content = hunk.content[:(hunkLength + readLength)]
				if _, err := file.ReadAt(hunk.content[hunkLength:], int64(readStart)); err != nil {
					return fmt.Errorf(
						"error while populating cache: failed to read %d "+
							"bytes from offset %d: %w",
						readLength, readStart, err,
					)
				}
			}
		}
	}
	return nil
}
// StoreData uses data — a slice of file bytes beginning at startOffset
// within the file — to populate the cache's content as far as possible.
//
// The parts of the cache covered by data are only guaranteed to become fully
// populated if everything in the file before startOffset has already been
// populated by prior calls to either ReadData() or StoreData().
func (fc *FileCache) StoreData(data []byte, startOffset uint64) {
	// Nothing to populate in an empty cache.
	if fc.hunks.Size == 0 {
		return
	}
	endOffset := startOffset + uint64(len(data))
	// Find the first hunk the data could possibly overlap: one before the
	// first hunk that starts after startOffset (when a predecessor exists).
	firstIndex, _ := fc.hunks.Search(
		func(h fileHunk) bool { return h.startOffset > startOffset },
		true,
	)
	if firstIndex > 0 {
		firstIndex--
	}
	// Walk forward to the last hunk the data could possibly overlap.
	lastIndex := firstIndex
	for lastIndex+1 < fc.hunks.Size && fc.hunks.Get(lastIndex).endOffset < endOffset {
		lastIndex++
	}
	// Offer the data to every candidate hunk; each hunk's own StoreData()
	// ignores data that does not actually extend its populated content.
	for i := firstIndex; i <= lastIndex; i++ {
		fc.hunks.Get(i).StoreData(data, startOffset)
	}
}
// GetRange implements the RangeGetter interface.
//
// It returns any bytes from the range [startOffset, startOffset+length)
// which are currently present in the cache.
//
// The first return value is a []byte containing data from the specified
// range, and the second is the offset within the file to which the start of
// that slice refers.
//
// When multiple disjoint parts of the desired range are present in the
// cache, only the first part is returned.
//
// The returned slice is a window into the cache, so should be treated as
// read-only.
func (fc *FileCache) GetRange(startOffset, length uint64) ([]byte, uint64) {
	rangeEnd := startOffset + length
	// Index of the first hunk starting strictly after startOffset; the hunk
	// before it (if any) is the last one starting at or before startOffset.
	idx, _ := fc.hunks.Search(
		func(h fileHunk) bool { return h.startOffset > startOffset },
		false,
	)
	// Try the hunk that starts at or before startOffset first.
	if idx > 0 {
		h := fc.hunks.Get(idx - 1)
		populatedEnd := h.startOffset + uint64(len(h.content))
		if populatedEnd > startOffset {
			// Return the overlap, clipped to the requested length.
			from := startOffset - h.startOffset
			to := populatedEnd - h.startOffset
			if from+length < to {
				to = from + length
			}
			return h.content[from:to], startOffset
		}
	}
	// Otherwise, see whether the first hunk starting after startOffset
	// reaches into the requested range.
	if idx != fc.hunks.Size {
		h := fc.hunks.Get(idx)
		if h.startOffset < rangeEnd {
			populatedEnd := h.startOffset + uint64(len(h.content))
			if populatedEnd > rangeEnd {
				// Clip the hunk's content to the end of the requested range.
				return h.content[:rangeEnd-h.startOffset], h.startOffset
			}
			return h.content, h.startOffset
		}
	}
	// No part of the requested range is present.
	return nil, startOffset
}
// RGLock implements the RangeGetter interface.
//
// A FileCache is not expected to be threadsafe, so no locking is required.
func (fc *FileCache) RGLock() {
	// Since a FileCache is not expected to be threadsafe, do nothing.
}
// RGRelease implements the RangeGetter interface; it is the counterpart to
// RGLock().
func (fc *FileCache) RGRelease() {
	// Since a FileCache is not expected to be threadsafe, do nothing.
}
// GetFullRange returns the bytes from the range
// [startOffset, startOffset+length) currently present in the cache.
//
// It is identical to GetRange() except that it returns the specified range
// of bytes only if the range can be retrieved in its entirety, and nil
// otherwise.
//
// The returned slice is a window into the cache, so should be treated as
// read-only.
func (fc *FileCache) GetFullRange(startOffset, length uint64) []byte {
	data, _ := fc.GetRange(startOffset, length)
	if uint64(len(data)) != length {
		return nil
	}
	return data
}
// A fileHunk describes one of the disjoint byte strings stored by a FileCache
type fileHunk struct {
	// A slice containing the contents of the file starting from startOffset.
	// Its length may be anything from 0 up to (endOffset - startOffset).
	content []byte
	// The offset within the file at which the hunk starts.
	startOffset uint64
	// The offset of the first byte that should NOT be appended to the hunk
	// when data is being read in. It only bounds how far reading may
	// proceed; the hunk's current populated end is always given by
	// startOffset + len(content).
	endOffset uint64
}

// StoreData appends to the hunk's content whatever portion of data — a slice
// of file bytes beginning at startOffset within the file — extends the
// hunk's populated region, clamped so nothing past endOffset is stored.
//
// If startOffset lies partway through the hunk, the data is only used when
// the part of the hunk before startOffset is already fully populated; data
// that would leave a gap is ignored.
func (h *fileHunk) StoreData(data []byte, startOffset uint64) {
	populatedEnd := h.startOffset + uint64(len(h.content))
	if startOffset > populatedEnd {
		// Accepting this data would leave an unpopulated gap — ignore it.
		return
	}
	from := populatedEnd - startOffset
	if from >= uint64(len(data)) {
		// Nothing in data lies beyond what is already populated.
		return
	}
	if h.content == nil {
		// Lazily allocate the full capacity on first use.
		h.content = make([]byte, 0, h.endOffset-h.startOffset)
	}
	to := uint64(len(data))
	if startOffset+to > h.endOffset {
		// Clamp so we never store past the hunk's declared end.
		to = h.endOffset - startOffset
	}
	h.content = append(h.content, data[from:to]...)
}