Fix for directory renames
[pithos-ms-client] / trunk / Pithos.Core / Agents / TupleBuilder.cs
1 using System;\r
2 using System.Collections.Concurrent;\r
3 using System.Collections.Generic;\r
4 using System.Diagnostics;\r
5 using System.IO;\r
6 using System.Linq;\r
7 using System.Reflection;\r
8 using System.Threading;\r
9 using System.Threading.Tasks;\r
10 using Pithos.Interfaces;\r
11 using Pithos.Network;\r
12 using log4net;\r
13 \r
14 namespace Pithos.Core.Agents\r
15 {\r
16     /// <summary>\r
17     /// Creates a set of state tuples from the local files, cloud files and state database entries\r
18     /// </summary>\r
19     class TupleBuilder\r
20     {\r
21         private static readonly ILog Log = LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);\r
22 \r
23         private readonly string _emptyGuid = Guid.Empty.ToString();\r
24 \r
25         private readonly CancellationToken _token;\r
26 \r
27         private readonly IStatusKeeper _statusKeeper;\r
28         private readonly IStatusNotification _statusNotification;\r
29         private readonly IPithosSettings _settings;\r
30 \r
/// <summary>
/// Creates a tuple builder that works with the supplied collaborators
/// </summary>
/// <param name="token">Cancellation token handed to the tree hash calculation</param>
/// <param name="statusKeeper">Supplies the block size and block hash, and stores file hashes and file states</param>
/// <param name="statusNotification">Receives the hashing progress notifications</param>
/// <param name="settings">Supplies the hashing parallelism setting</param>
public TupleBuilder(CancellationToken token, IStatusKeeper statusKeeper, IStatusNotification statusNotification, IPithosSettings settings)
{
    //Plain field assignments, independent of each other
    _settings = settings;
    _statusNotification = statusNotification;
    _statusKeeper = statusKeeper;
    _token = token;
}
38 \r
/// <summary>
/// Builds the list of sync tuples by combining the local files, the locally stored states and the cloud objects
/// </summary>
/// <param name="infos">Pairs of a file path and the cloud object that corresponds to that path</param>
/// <param name="files">The local files and folders</param>
/// <param name="states">The locally stored file states</param>
/// <param name="moves">Not used by this method</param>
/// <returns>
/// All tuples except the broken ones, ie those that have a state without a checksum and a server object,
/// but no existing local file. Broken tuples are marked as conflicts instead of being returned.
/// </returns>
public IEnumerable<StateTuple> MergeSources(IEnumerable<Tuple<string, ObjectInfo>> infos, IEnumerable<FileSystemInfo> files, List<FileState> states, ConcurrentDictionary<string, MovedEventArgs> moves)
{
    var tuplesByPath = new Dictionary<string, StateTuple>();

    //1. Start with one tuple per local file
    CreateTuplesFromFiles(files, tuplesByPath);
    //2. Attach the stored states, adding tuples for states that have no matching file
    MergeLocalStates(states, tuplesByPath);
    //3. Attach the cloud objects
    MergeCloudFiles(infos, tuplesByPath);
    //4. Collapse FROM/TO pairs that are actually local moves
    DetectLocalMoves(tuplesByPath);

    var allTuples = tuplesByPath.Values;

    //Sync algorithm fallout: a state without a checksum can exist in more than one way
    //1: The application stopped/crashed while downloading a file. The file's entry was created when the download started.
    //   After a restart there is no local file, a server file and a null state -> C: NULL L: NULL but exists, S: Some
    //   It can be fixed by NOT creating a local state if the file doesn't already exist, or by adding extra info
    //   to mark this as a result of an upload
    //2: A new file was added but the app stopped/crashed after the upload finished and before the entry was updated,
    //   and the user deleted the file. After a restart there is no local file, a server file and a null state
    //   -> C: NULL L: NULL but exists, S: Some
    var brokenTuples = allTuples
        .Where(t => t.FileState != null && t.FileState.Checksum == null)
        .Where(t => t.ObjectInfo != null)
        .Where(t => t.FileInfo == null || !t.FileInfo.Exists)
        .ToList();

    var actualTuples = allTuples.Except(brokenTuples).ToList();
    Debug.Assert(actualTuples.All(t => t.HashesValid()));

    //Broken tuples are flagged as conflicts
    brokenTuples.ForEach(broken =>
        _statusKeeper.SetFileState(broken.FileState.FilePath,
            FileStatus.Conflict, FileOverlayStatus.Conflict, "FileState without checksum encountered for server object missing from disk"));

    return actualTuples;
}
86 \r
87 \r
88         \r
/// <summary>
/// Detects local file moves, and the folder moves they imply, and collapses each FROM/TO pair into a single tuple
/// </summary>
/// <param name="tuplesByPath">The tuples indexed by path. FROM tuples are removed from it and TO tuples are updated in place</param>
/// <remarks>
/// Local moves will appear as two tuples with the same hashes in different locations:
/// - The FROM tuple will have a NULL C, and a filled L and S value
/// - The TO tuple will only have a C hash value equal to FROM's and empty L,S
/// The FROM tuple is removed and the TO tuple is updated to reflect that it's a MOVE operation.
/// This happens only if there are EXACTLY two tuples with the same hash: a hash that matches more than one
/// deleted file OR more than one created file is ambiguous and is not treated as a move.
/// </remarks>
private void DetectLocalMoves(Dictionary<string, StateTuple> tuplesByPath)
{
    Func<StateTuple, bool> isNew = t => t.C != null
        && (t.L == null || t.NullSafe(tp => tp.FileState).NullSafe(s => !s.FilePath.Equals(t.FilePath)))
        && t.S == null;
    Func<StateTuple, bool> isDeleted = t => t.C == null && t.L != null && t.L == t.S;

    //File candidates are materialized once, instead of re-running the filters for every created file.
    //Newly created files are candidate TOs
    var fileCreates = tuplesByPath.Values
        .Where(t => t.FileInfo is FileInfo && isNew(t))
        .ToList();
    //Newly deleted files are candidate FROMs
    var fileDeletes = tuplesByPath.Values
        .Where(t => t.NullSafe(t1 => t1.FileState).NullSafe(s => !s.IsFolder) && isDeleted(t))
        .ToList();

    //Folder candidates are left as deferred queries, so every lookup inside the loop is evaluated against
    //the current contents of the dictionary, including the changes made by earlier replacements
    var folderCreates = tuplesByPath.Values.Where(t => t.FileInfo is DirectoryInfo && isNew(t));
    var folderDeletes = tuplesByPath.Values
        .Where(t => t.NullSafe(t1 => t1.FileState).NullSafe(s => s.IsFolder) && isDeleted(t));

    var moves = (from created in fileCreates
                 //Two matches are enough to know that the FROM is not unique
                 let froms = fileDeletes.Where(d => d.L == created.C).Take(2).ToList()
                 where froms.Count == 1
                       //The TO must be unique as well, otherwise several created files would claim the same FROM
                       && !fileCreates.Any(other => !Object.ReferenceEquals(other, created) && other.C == created.C)
                 select new { To = created, From = froms[0] }).ToList();

    foreach (var move in moves)
    {
        //Remove the old tuple and mark the new one as the target of a move
        ReplaceTupleForMove(tuplesByPath, move.From, move.To);

        //Detect folder moves from moved files.
        //Can't create ObjectInfo, FileState for a directory if we don't store directories in the database,
        //so find a folderCreate that matches the TO and a folderDelete that matches the FROM
        var toFolder = folderCreates.SingleOrDefault(fd => move.To.FilePath.IsAtOrDirectlyBelow(fd.FilePath));
        var fromFolder = folderDeletes.SingleOrDefault(fd => move.From.FilePath.IsAtOrDirectlyBelow(fd.FilePath));

        //Folders may not be stored in states, in which case there is nothing to replace
        if (fromFolder != null && toFolder != null)
        {
            ReplaceTupleForMove(tuplesByPath, fromFolder, toFolder);
        }
    }
}
154 \r
/// <summary>
/// Collapses a FROM/TO pair into a single move tuple
/// </summary>
/// <param name="tuplesByPath">The tuples indexed by path. The entry stored under the FROM tuple's path is removed</param>
/// <param name="fromTuple">The tuple at the old location. Supplies the old path, old checksum, state and cloud object</param>
/// <param name="toTuple">The tuple at the new location. Updated in place</param>
private static void ReplaceTupleForMove(Dictionary<string, StateTuple> tuplesByPath, StateTuple fromTuple, StateTuple toTuple)
{
    var oldPath = fromTuple.FilePath;

    //The old location no longer gets a tuple of its own
    tuplesByPath.Remove(oldPath);

    //The new tuple carries everything needed to perform the Move
    toTuple.OldFullPath = oldPath;
    toTuple.NewFullPath = toTuple.FilePath;
    toTuple.OldChecksum = fromTuple.L;
    toTuple.FileState = fromTuple.FileState;
    toTuple.ObjectInfo = fromTuple.ObjectInfo;
}
165 \r
/// <summary>
/// Adds one tuple per local file or folder to the dictionary, keyed by the tuple's path
/// </summary>
/// <param name="files">The local files and folders</param>
/// <param name="tuplesByPath">The dictionary that receives the new tuples</param>
private static void CreateTuplesFromFiles(IEnumerable<FileSystemInfo> files, Dictionary<string, StateTuple> tuplesByPath)
{
    foreach (var tuple in files.Select(info => new StateTuple(info)))
    {
        //The indexer is used on purpose: an entry with the same path replaces the previous one
        tuplesByPath[tuple.FilePath] = tuple;
    }
}
189 \r
/// <summary>
/// Attaches the stored states to the tuples and fills in the hashes of the tuples
/// </summary>
/// <param name="states">The stored file states</param>
/// <param name="tuplesByPath">The tuples indexed by path. Receives a new tuple for each state whose path has no tuple yet</param>
/// <remarks>
/// Hashes are updated for tuples that matched a state and for tuples that ended up without any state.
/// Tuples created here for a state without a matching entry are not hashed.
/// </remarks>
private void MergeLocalStates(IEnumerable<FileState> states, Dictionary<string, StateTuple> tuplesByPath)
{
    foreach (var state in states)
    {
        var path = state.FilePath;

        StateTuple existing;
        if (!tuplesByPath.TryGetValue(path, out existing))
        {
            //No tuple for this path yet, create one from the state
            var fsInfo = FileInfoExtensions.FromPath(path);
            tuplesByPath[path] = new StateTuple { FileInfo = fsInfo, FileState = state };
            continue;
        }

        //A tuple already exists for this path, it only needs the state and its hashes
        existing.FileState = state;
        UpdateHashes(existing);
    }

    //Whatever is left without a state still needs its hashes
    var withoutState = tuplesByPath.Values.Where(t => t.FileState == null).ToArray();

    //A handful of tuples is processed in place, larger batches are processed in parallel
    if (withoutState.Length <= 20)
    {
        withoutState.ApplyAction(UpdateHashes);
    }
    else
    {
        Parallel.ForEach(withoutState, UpdateHashes);
    }
}
240 \r
241 \r
/// <summary>
/// Update the tuple with the file's hashes, avoiding calculation if the file is unchanged
/// </summary>
/// <param name="hashTuple">The tuple to update. Its Merkle value is set, or Locked is set if the file can't be read</param>
/// <remarks>
/// The function first checks the file's size and last write date against the stored state. The stored hashes
/// are reused only if both are unchanged, the stored top hash agrees with the file being empty or not,
/// and the stored hashes are actually available. In every other case the tree hash is recalculated.
/// Tuples that don't refer to a file get the empty tree hash.
/// An IOException during processing marks the tuple as locked instead of propagating.
/// </remarks>
private void UpdateHashes(StateTuple hashTuple)
{
    try
    {
        var state = hashTuple.NullSafe(s => s.FileState);
        var storedHash = state.NullSafe(s => s.Checksum);
        var storedHashes = state.NullSafe(s => s.Hashes);
        var storedDate = state.NullSafe(s => s.LastWriteDate) ?? DateTime.MinValue;
        var storedLength = state.NullSafe(s => s.LastLength);

        var merkle = TreeHash.Empty;

        if (hashTuple.FileInfo is FileInfo)
        {
            var file = (FileInfo)hashTuple.FileInfo.WithProperCapitalization();

            //Attributes unchanged?
            //LastWriteTime is only accurate to the second, so differences below one second are ignored.
            //The ABSOLUTE difference is checked: a file whose write time is OLDER than the stored one
            //(eg. a restored previous version) has changed too
            var unchangedAttributes = (file.LastWriteTime - storedDate).Duration() < TimeSpan.FromSeconds(1)
                && storedLength == file.Length;

            //Attributes appear unchanged but the file length doesn't match the stored hash ?
            var nonEmptyMismatch = unchangedAttributes &&
                (file.Length == 0 ^ storedHash == Signature.MERKLE_EMPTY);

            //Missing hashes for NON-EMPTY hash ?
            var missingHashes = storedHash != Signature.MERKLE_EMPTY &&
                String.IsNullOrWhiteSpace(storedHashes);

            //If the attributes have changed,
            //or they are unchanged but the Merkle doesn't match the size,
            //or the hashes are missing but the tophash is NOT empty, we need to recalculate
            //
            //Otherwise we load the hashes from state
            if (!unchangedAttributes || nonEmptyMismatch || missingHashes)
            {
                merkle = RecalculateTreehash(file);
            }
            else
            {
                merkle = TreeHash.Parse(hashTuple.FileState.Hashes);
            }
        }

        //Setting Merkle also updates C
        hashTuple.Merkle = merkle;
    }
    catch (IOException)
    {
        //The file can't be read right now, flag the tuple instead of failing
        hashTuple.Locked = true;
    }
}
314 \r
/// <summary>
/// Calculates a file's tree hash, reporting progress while hashing, and passes the result to the status keeper
/// </summary>
/// <param name="file">The file to hash</param>
/// <returns>The calculated tree hash</returns>
private TreeHash RecalculateTreehash(FileInfo file)
{
    //Every progress report becomes a status notification
    Action<HashProgress> report = hashProgress =>
    {
        var message = String.Format("Hashing {0:p} of {1}", hashProgress.Percentage, file.Name);
        _statusNotification.Notify(new StatusNotification(message));
    };
    var progress = new Progress<HashProgress>(report);

    var treeHash = Signature.CalculateTreeHash(
        file,
        _statusKeeper.BlockSize,
        _statusKeeper.BlockHash,
        _settings.HashingParallelism,
        _token,
        progress);

    _statusKeeper.UpdateFileHashes(file, treeHash);
    return treeHash;
}
330 \r
/// <summary>
/// Attaches the cloud objects to the tuples, creating new tuples for objects that match no existing tuple
/// </summary>
/// <param name="infos">Pairs of a file path and the cloud object that corresponds to that path</param>
/// <param name="tuplesByPath">The tuples indexed by path. Receives a new tuple for every unmatched cloud object</param>
/// <remarks>
/// A cloud object is matched first by its UUID against the ObjectID of the stored states, then by path.
/// Null and empty-GUID IDs are never used for matching. If several states share the same ObjectID,
/// the first tuple encountered is used instead of failing with a duplicate key error.
/// </remarks>
private void MergeCloudFiles(IEnumerable<Tuple<string, ObjectInfo>> infos, Dictionary<string, StateTuple> tuplesByPath)
{
    //Index the tuples by object ID. States without a usable ID are skipped. Grouping tolerates duplicate IDs,
    //which would make a plain ToDictionary throw
    var tuplesById = tuplesByPath.Values
        .Where(tuple => tuple.FileState != null
                        && tuple.FileState.ObjectID != null
                        && tuple.FileState.ObjectID != _emptyGuid)
        .GroupBy(tuple => tuple.FileState.ObjectID)
        .ToDictionary(group => group.Key, group => group.First());

    foreach (var info in infos)
    {
        var filePath = info.Item1;
        var objectInfo = info.Item2;
        var objectId = objectInfo.UUID;

        //A null key would make the dictionary throw, an empty GUID identifies nothing
        var hasUsableId = objectId != null && objectId != _emptyGuid;

        StateTuple hashTuple;
        if (hasUsableId && tuplesById.TryGetValue(objectId, out hashTuple))
        {
            //This will handle renamed objects
            hashTuple.ObjectInfo = objectInfo;
        }
        else if (tuplesByPath.TryGetValue(filePath, out hashTuple))
        {
            hashTuple.ObjectInfo = objectInfo;
        }
        else
        {
            //Nothing matches, the object gets a tuple of its own
            var fsInfo = FileInfoExtensions.FromPath(filePath);
            hashTuple = new StateTuple { FileInfo = fsInfo, ObjectInfo = objectInfo };
            tuplesByPath[filePath] = hashTuple;

            if (hasUsableId)
            {
                tuplesById[objectId] = hashTuple;
            }
        }
    }
}
364 \r
365 \r
366     }\r
367 }\r