pithos/lib/hashfiler/blocker.py @ 5bd53e3b
#!/usr/bin/env python

from os import makedirs
from os.path import isdir, realpath, exists, join
from hashlib import new as newhasher
from binascii import hexlify

# file_sync_read_chunks backs the *_file helpers below.
from pithos.lib.hashfiler.context_file import ContextFile, file_sync_read_chunks


class Blocker(object):
    """Blocker.
       Required constructor parameters: blocksize, blockpath, hashtype.
    """

    blocksize = None
    blockpath = None
    hashtype = None

    def __init__(self, **params):
        blocksize = params['blocksize']
        blockpath = realpath(params['blockpath'])
        if not isdir(blockpath):
            if not exists(blockpath):
                makedirs(blockpath)
            else:
                raise ValueError("blockpath '%s' is not a directory" % (blockpath,))

        hashtype = params['hashtype']
        try:
            hasher = newhasher(hashtype)
        except ValueError:
            msg = "hashtype '%s' is not available from hashlib"
            raise ValueError(msg % (hashtype,))

        hasher.update("")
        emptyhash = hasher.digest()

        self.blocksize = blocksize
        self.blockpath = blockpath
        self.hashtype = hashtype
        self.hashlen = len(emptyhash)
        self.emptyhash = emptyhash

    def get_rear_block(self, blkhash, create=0):
        """Return a ContextFile for the block with this hash,
           optionally creating its backing file.
        """
        name = join(self.blockpath, hexlify(blkhash))
        return ContextFile(name, create)

    def check_rear_block(self, blkhash):
        """Return whether a block with this hash exists in storage."""
        name = join(self.blockpath, hexlify(blkhash))
        return exists(name)

    def block_hash(self, data):
        """Hash a block of data."""
        hasher = newhasher(self.hashtype)
        # Trailing zeros are not significant: strip them before hashing.
        hasher.update(data.rstrip('\x00'))
        return hasher.digest()

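    # Illustrative property of the stripping above: a block and its
    # zero-padded form hash identically, i.e. for any str b and n >= 0,
    #   block_hash(b) == block_hash(b + '\x00' * n)
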
    def block_ping(self, hashes):
        """Check hashes for existence and
           return those missing from block storage.
        """
        missing = []
        append = missing.append
        for i, h in enumerate(hashes):
            if not self.check_rear_block(h):
                append(i)
        return missing

    def block_retr(self, hashes):
        """Retrieve blocks from storage by their hashes.
           Stops at the first missing block, so the result
           may be shorter than the input.
        """
        blocksize = self.blocksize
        blocks = []
        append = blocks.append

        for h in hashes:
            block = None  # reset per hash so a stale block is never reused
            with self.get_rear_block(h, 0) as rbl:
                if not rbl:
                    break
                for block in rbl.sync_read_chunks(blocksize, 1, 0):
                    break  # there should be just one block there
            if not block:
                break
            append(block)

        return blocks

    def block_stor(self, blocklist):
        """Store a bunch of blocks and return (hashes, missing).
           Hashes is a list of the hashes of the blocks,
           missing is a list of indices in that list indicating
           which blocks were missing from the store.
        """
        block_hash = self.block_hash
        hashlist = [block_hash(b) for b in blocklist]
        missing = self.block_ping(hashlist)
        for i in missing:
            with self.get_rear_block(hashlist[i], 1) as rbl:
                rbl.sync_write(blocklist[i])  # XXX: verify?

        return hashlist, missing

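    # Contract example (hypothetical values): if 'a' is already stored
    # and 'b' is not, block_stor(('a', 'b')) returns
    # ([hash_a, hash_b], [1]); only index 1 is written to the store.
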
    def block_delta(self, blkhash, offdata=()):
        """Construct and store a new block from a given block
           and a list of (offset, data) 'patches'. Return:
           (the hash of the new block, 1 if it was missing from the store)
        """
        if not offdata:
            return None, None

        blocksize = self.blocksize
        block = self.block_retr((blkhash,))
        if not block:
            return None, None

        block = block[0]
        newblock = ''
        idx = 0
        size = 0
        trunc = 0
        # Patches are assumed sorted by offset and non-overlapping.
        for off, data in offdata:
            if not data:
                trunc = 1
                break
            newblock += block[idx:off] + data
            size += off - idx + len(data)
            idx = off + len(data)
            if size >= blocksize:
                break

        if not trunc:
            # Append the unpatched remainder of the original block.
            newblock += block[size:]

        h, a = self.block_stor((newblock,))
        return h[0], 1 if a else 0

    def block_hash_file(self, openfile):
        """Return the list of hashes (hashes map)
           for the blocks in a buffered file.
           Helper method, does not affect store.
        """
        hashes = []
        append = hashes.append
        block_hash = self.block_hash

        for block in file_sync_read_chunks(openfile, self.blocksize, 1, 0):
            append(block_hash(block))

        return hashes

    def block_stor_file(self, openfile):
        """Read blocks from buffered file object and store them. Return:
           (bytes read, list of hashes, list of hashes that were missing)
        """
        blocksize = self.blocksize
        block_stor = self.block_stor
        hashlist = []
        hextend = hashlist.extend
        storedlist = []
        sextend = storedlist.extend
        lastsize = 0

        for block in file_sync_read_chunks(openfile, blocksize, 1, 0):
            hl, sl = block_stor((block,))
            hextend(hl)
            sextend(sl)
            lastsize = len(block)

        # All blocks are full-sized except possibly the last one.
        size = (len(hashlist) - 1) * blocksize + lastsize if hashlist else 0
        return size, hashlist, storedlist
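

# A minimal usage sketch (illustrative): the blockpath, hashtype and
# blocksize below are hypothetical choices, not values the module
# prescribes.
if __name__ == '__main__':
    blocker = Blocker(blocksize=4096, blockpath='/tmp/blocks',
                      hashtype='sha256')

    # Store two blocks; 'missing' holds the indices that were new.
    hashes, missing = blocker.block_stor(('hello', 'world'))
    print len(hashes), missing

    # Pinging the same hashes again should report nothing missing.
    assert blocker.block_ping(hashes) == []

    # Read the blocks back and patch the first one at offset 0.
    blocks = blocker.block_retr(hashes)  # expected: ['hello', 'world']
    newhash, stored = blocker.block_delta(hashes[0], [(0, 'HELLO')])
    print hexlify(newhash), stored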