#!/usr/bin/env python
#
# (c) Jason "RJay45" Griffith 2017
#
import argparse
import csv
import hashlib
import json
import os.path
import shutil
import sys
from os import scandir
from os.path import isfile
from time import gmtime, strftime


def progress(filename, size):
    # Throttle status output: print every 25th file, and always for files over
    # 1 GiB, since those take long enough to hash to be worth announcing.
    progress.count += 1
    if progress.count >= 25 or size > 1073741824:
        progress.count = 0
        try:
            output = "Current File: " + filename
            width = shutil.get_terminal_size((80, 20)).columns - 1
            if len(output) > width:
                output = output[:width]
            # Pad with spaces so a shorter name fully overwrites the previous line.
            sys.stdout.write("\r" + output + ' ' * (shutil.get_terminal_size((80, 20)).columns - len(output) - 1))
            sys.stdout.flush()
        except UnicodeEncodeError:
            # FIXME: Some file names have UTF-16 characters in them which cause
            # exceptions when writing to STDOUT.
            return


# Start the counter at the threshold so the very first file is reported.
progress.count = 25
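
# A rough sketch of the throttle behavior (the loop below is illustrative,
# not part of the script): with the counter initialized to 25, the first call
# prints, the counter resets, and every 25th call after that prints again.
#
#   for i in range(60):
#       progress("file_%d" % i, 0)   # prints at i == 0, 25, 50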


def hash_file(file_name, length):
    # SHA-1 the first `length` bytes of the file, reading in 4 KiB chunks so
    # large files never have to fit in memory.
    try:
        file_hash = hashlib.sha1()
        bytes_read = 0
        chunk = 4096
        with open(file_name, "rb") as file:
            while bytes_read < length:
                file_hash.update(file.read(chunk))
                bytes_read += chunk
        return file_hash.hexdigest()
    except Exception:
        # The sentinel string becomes a dict key, so unreadable files still
        # group together instead of crashing the scan.
        return "Unable to hash file"


def traverse(current_path, results, recurse=True):
    try:
        for file in scandir(current_path):
            if file.is_file(follow_symlinks=False):
                # st_size is an int; debug files loaded on --resume have string
                # keys, which main() converts back to ints before merging.
                try:
                    size = file.stat(follow_symlinks=False).st_size
                except OSError:
                    print("Unable to get size of file " + file.path + ", skipping.")
                    continue
                progress(file.path, size)
                # We only compute hashes when there are two files of the same size;
                # otherwise we leave an indicator and run the hash later if we
                # find a second same-sized file.
                if size not in results:
                    results[size] = {
                        'first': file.path
                    }
                    continue
                elif 'first' in results[size]:
                    # Second file of this size: hash the deferred first file now.
                    results[size][hash_file(results[size]['first'], size)] = [results[size]['first']]
                    results[size].pop('first', None)
                hsum = hash_file(file.path, size)
                if hsum not in results[size]:
                    results[size][hsum] = [file.path]
                else:
                    results[size][hsum].append(file.path)
            elif recurse:
                # Don't follow symlinks, and keep scanning the rest of this
                # directory (a bare `return` here would abort it).
                if file.is_symlink():
                    continue
                traverse(file.path, results)
    except Exception as ex:
        print("\nUnable to scan directory: " + current_path + " (" + str(ex) + ")")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('PATH', help="Path to search for duplicate files. If multiple paths are given, files from all "
                                     "paths will be compared to each other.", nargs='+')
    parser.add_argument('-d', '--debug', help="Enable debug mode. This will output a debug file containing the "
                                              "results dictionary.", action="store_true")
    parser.add_argument('-r', '--resume', help="Resume a previous run by using a generated debug file.",
                        default=None, type=str)
    parser.add_argument('-R', '--no-recurse', help="Disable directory recursion.",
                        default=False, action="store_true")
    args = parser.parse_args()

    # Quick check that the paths are valid before we continue.
    errors = ""
    for path in args.PATH:
        if not os.path.isdir(path):
            errors += "\n" + path + " is not a directory."
    if errors != "":
        print("Error:" + errors)
        sys.exit(1)

    results = {}
    if args.resume is not None:
        if not isfile(args.resume):
            print("Cannot load previous results, file " + args.resume + " does not exist.")
            sys.exit(1)
        print("Loading results from " + args.resume + "...")
        try:
            with open(args.resume, "r") as f:
                results = json.load(f)
        except Exception:
            print("Unable to read or parse debug file.")
            sys.exit(1)
        # JSON object keys are always strings; convert them back to ints so
        # they match the st_size values gathered during this run.
        results = {int(size): entry for size, entry in results.items()}
        print("Loading complete.")

    for path in args.PATH:
        traverse(path, results, not args.no_recurse)

    datestamp = strftime("%Y%m%d_%H%M%S", gmtime())
    with open("./output_" + datestamp + ".csv", "w", newline='') as f:
        # csv.writer handles quoting, so file names containing commas stay intact.
        writer = csv.writer(f)
        writer.writerow(['Hash', 'Size', 'Name'])
        for size in results:
            if 'first' in results[size]:
                # Only one file of this size was seen, so it has no duplicates.
                continue
            for hsum in results[size]:
                if len(results[size][hsum]) > 1:
                    for file_name in results[size][hsum]:
                        writer.writerow([hsum, str(size), file_name])

    if args.debug:
        with open("./debug_" + datestamp + ".txt", "w") as f:
            f.write(json.dumps(results, indent=2))


if __name__ == "__main__":
    main()
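
# Example invocations (paths and file names are hypothetical):
#
#   python df.py /data/photos /backup/photos                  # compare two trees
#   python df.py -R /srv/files                                # top level only
#   python df.py -d /data/photos                              # also write debug_<timestamp>.txt
#   python df.py -r debug_20170101_120000.txt /data/photos    # resume from a debug file
#
# Duplicate groups are written to output_<timestamp>.csv in the current directory.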