filter-repo: group repo analysis functions into a class
Signed-off-by: Elijah Newren <newren@gmail.com>

parent 9887dd5cbe
commit 4e2110136e

git-filter-repo: 734 lines changed

@@ -1891,10 +1891,15 @@ class FilteringOptions(object):
     FilteringOptions.sanity_check_args(args)
     return args
 
-def analyze_commit(stats, graph, commit, parents, date, file_changes):
-  def equiv_class(filename):
+class RepoAnalyze(object):
+
+  # First, several helper functions for analyze_commit()
+
+  @staticmethod
+  def equiv_class(stats, filename):
     return stats['equivalence'].get(filename, (filename,))
 
+  @staticmethod
   def setup_equivalence_for_rename(stats, oldname, newname):
     # if A is renamed to B and B is renamed to C, then the user thinks of
     # A, B, and C as all being different names for the same 'file'. We record

@@ -1911,18 +1916,22 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
     for f in new_tuple:
       stats['equivalence'][f] = new_tuple
 
+  @staticmethod
   def setup_or_update_rename_history(stats, commit, oldname, newname):
     rename_commits = stats['rename_history'].get(oldname, set())
     rename_commits.add(commit)
     stats['rename_history'][oldname] = rename_commits
 
+  @staticmethod
   def handle_renames(stats, commit, change_types, filenames):
     for index, change_type in enumerate(change_types):
       if change_type == 'R':
         oldname, newname = filenames[index], filenames[-1]
-        setup_equivalence_for_rename(stats, oldname, newname)
-        setup_or_update_rename_history(stats, commit, oldname, newname)
+        RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
+        RepoAnalyze.setup_or_update_rename_history(stats, commit,
+                                                   oldname, newname)
 
+  @staticmethod
   def handle_file(stats, graph, commit, modes, shas, filenames):
     mode, sha, filename = modes[-1], shas[-1], filenames[-1]
 
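The equivalence bookkeeping above is easiest to see with a toy example. The sketch below is illustrative and not part of the commit: equiv_class is copied from the diff, while the pre-built tuple stands in for what successive setup_equivalence_for_rename calls would record after renaming A -> B and then B -> C.

  # Illustrative sketch, not from the commit: how the 'equivalence'
  # mapping behaves once renames A -> B -> C have been recorded.
  stats = {'equivalence': {}}

  new_tuple = ('A', 'B', 'C')   # what successive renames would build up
  for f in new_tuple:
    stats['equivalence'][f] = new_tuple

  def equiv_class(stats, filename):
    return stats['equivalence'].get(filename, (filename,))

  print(equiv_class(stats, 'B'))        # ('A', 'B', 'C')
  print(equiv_class(stats, 'other.c'))  # ('other.c',)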

@@ -1936,7 +1945,7 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
 
     # If the file (or equivalence class of files) was recorded as deleted,
     # clearly it isn't anymore
-    equiv = equiv_class(filename)
+    equiv = RepoAnalyze.equiv_class(stats, filename)
     for f in equiv:
       stats[delmode].pop(f, None)
 
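To make the deletion-undoing above concrete, here is a small illustrative sketch (not from the commit; delmode is simplified to 'file_deletions'): re-adding any name in an equivalence class clears the recorded deletion for every alias.

  # Illustrative sketch, not from the commit: re-adding any name in an
  # equivalence class clears the recorded deletion for every alias.
  stats = {'equivalence': {'A': ('A', 'B'), 'B': ('A', 'B')},
           'file_deletions': {'A': '2019-01-01', 'B': '2019-01-01'}}

  equiv = stats['equivalence'].get('B', ('B',))
  for f in equiv:
    stats['file_deletions'].pop(f, None)

  print(stats['file_deletions'])  # {}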

@@ -1954,383 +1963,392 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
         if f in stats['equivalence']:
           del stats['equivalence'][f]
 
-  graph.add_commit_and_parents(commit, parents)
-  for change in file_changes:
-    modes, shas, change_types, filenames = change
-    if len(parents) == 1 and change_types.startswith('R'):
-      change_types = 'R' # remove the rename score; we don't care
-    if modes[-1] == '160000':
-      continue
-    elif modes[-1] == '000000':
-      # Track when files/directories are deleted; see 'R' below about equiv_class
-      for f in equiv_class(filenames[-1]):
-        if any(x == '040000' for x in modes[0:-1]):
-          stats['tree_deletions'][f] = date
-        else:
-          stats['file_deletions'][f] = date
-    elif change_types.strip('AMT') == '':
-      handle_file(stats, graph, commit, modes, shas, filenames)
-    elif modes[-1] == '040000' and change_types.strip('RAM') == '':
-      handle_file(stats, graph, commit, modes, shas, filenames)
-    elif change_types.strip('RAM') == '':
-      handle_file(stats, graph, commit, modes, shas, filenames)
-      handle_renames(stats, commit, change_types, filenames)
-    else:
-      raise SystemExit("Unhandled change type(s): {} (in commit {})"
-                       .format(change_types, commit))
-
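For readers unfamiliar with the raw diff format, a hypothetical illustration of the change_types strings the dispatch above distinguishes (the submodule, deletion, and tree cases are omitted): a plain edit yields 'M', a merge can combine letters such as 'MM', and a rename carries a similarity score such as 'R100', which the single-parent case above normalizes to 'R'.

  # Hypothetical change_types values, for illustration only.
  for change_types in ('M', 'A', 'T', 'MM', 'R', 'RM'):
    if change_types.strip('AMT') == '':
      print(change_types + ' -> handle_file only')
    elif change_types.strip('RAM') == '':
      print(change_types + ' -> handle_file plus rename tracking')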
-def gather_data(args):
-  blob_size_progress = ProgressWriter()
-  num_blobs = 0
-
-  # Get sizes of blobs by sha1
-  a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
-  cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
-                        bufsize = -1,
-                        stdout = subprocess.PIPE)
-  unpacked_size = {}
-  packed_size = {}
-  for line in cf.stdout:
-    sha, objtype, objsize, objdisksize = line.split()
-    objsize, objdisksize = int(objsize), int(objdisksize)
-    if objtype == 'blob':
-      unpacked_size[sha] = objsize
-      packed_size[sha] = objdisksize
-    num_blobs += 1
-    blob_size_progress.show("Processed {} blob sizes".format(num_blobs))
-  cf.wait()
-  blob_size_progress.finish()
-  stats = {'names': collections.defaultdict(set),
-           'allnames' : set(),
-           'file_deletions': {},
-           'tree_deletions': {},
-           'equivalence': {},
-           'rename_history': collections.defaultdict(set),
-           'unpacked_size': unpacked_size,
-           'packed_size': packed_size,
-           'num_commits': 0}
-
-  # Setup the rev-list/diff-tree process
-  commit_parse_progress = ProgressWriter()
-  num_commits = 0
-  cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
-  dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
-  f = dtp.stdout
-  line = f.next()
-  cont = bool(line)
-  graph = AncestryGraph()
-  while cont:
-    commit = line.rstrip()
-    parents = f.next().split()
-    date = f.next().rstrip()
-
-    # We expect a blank line next; if we get a non-blank line then
-    # this commit modified no files and we need to move on to the next.
-    # If there is no line, we've reached end-of-input.
-    try:
-      line = f.next().rstrip()
-      cont = True
-    except StopIteration:
-      cont = False
-
-    # If we haven't reached end of input, and we got a blank line meaning
-    # a commit that has modified files, then get the file changes associated
-    # with this commit.
-    file_changes = []
-    if cont and not line:
-      cont = False
-      for line in f:
-        if not line.startswith(':'):
-          cont = True
-          break
-        n = 1+max(1, len(parents))
-        assert line.startswith(':'*(n-1))
-        relevant = line[n-1:-1]
-        splits = relevant.split(None, n)
-        modes = splits[0:n]
-        splits = splits[n].split(None, n)
-        shas = splits[0:n]
-        splits = splits[n].split('\t')
-        change_types = splits[0]
-        filenames = [PathQuoting.dequote(x) for x in splits[1:]]
-        file_changes.append([modes, shas, change_types, filenames])
-
-    # Analyze this commit and update progress
-    analyze_commit(stats, graph, commit, parents, date, file_changes)
-    num_commits += 1
-    commit_parse_progress.show("Processed {} commits".format(num_commits))
-
-  # Show the final commits processed message and record the number of commits
-  commit_parse_progress.finish()
-  stats['num_commits'] = num_commits
-
-  # Close the output, ensure rev-list|diff-tree pipeline completed successfully
-  dtp.stdout.close()
-  if dtp.wait():
-    raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
-
-  return stats
-
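For reference, each line that git cat-file emits under this --batch-check format looks like the samples below; the hashes and sizes are made-up values, and only the parsing done in gather_data above is exercised.

  # Hypothetical output of:
  #   git cat-file --batch-all-objects \
  #     '--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
  sample = ['f1d2d2f924e986ac86fdf7b36c94bcdf32beec15 blob 10500 2048\n',
            '4b825dc642cb6eb9a060e54bf8d69288fbee4904 tree 0 9\n']
  for line in sample:
    sha, objtype, objsize, objdisksize = line.split()
    if objtype == 'blob':
      print('{}: {} bytes unpacked, {} on disk'.format(sha, objsize, objdisksize))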
-def write_report(reportdir, stats):
-  def datestr(datetimestr):
-    return datetimestr if datetimestr else '<present>'
-
-  def dirnames(path):
-    while True:
-      path = os.path.dirname(path)
-      yield path
-      if path == '':
-        break
-
-  # Compute aggregate size information for paths, extensions, and dirs
-  total_size = {'packed': 0, 'unpacked': 0}
-  path_size = {'packed': collections.defaultdict(int),
-               'unpacked': collections.defaultdict(int)}
-  ext_size = {'packed': collections.defaultdict(int),
-              'unpacked': collections.defaultdict(int)}
-  dir_size = {'packed': collections.defaultdict(int),
-              'unpacked': collections.defaultdict(int)}
-  for sha in stats['names']:
-    size = {'packed': stats['packed_size'][sha],
-            'unpacked': stats['unpacked_size'][sha]}
-    for which in ('packed', 'unpacked'):
-      for name in stats['names'][sha]:
-        total_size[which] += size[which]
-        path_size[which][name] += size[which]
-        basename, ext = os.path.splitext(name)
-        ext_size[which][ext] += size[which]
-        for dirname in dirnames(name):
-          dir_size[which][dirname] += size[which]
-
-  # Determine if and when extensions and directories were deleted
-  ext_deleted_data = {}
-  for name in stats['allnames']:
-    when = stats['file_deletions'].get(name, None)
-
-    # Update the extension
-    basename, ext = os.path.splitext(name)
-    if when is None:
-      ext_deleted_data[ext] = None
-    elif ext in ext_deleted_data:
-      if ext_deleted_data[ext] is not None:
-        ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
-    else:
-      ext_deleted_data[ext] = when
-
-  dir_deleted_data = {}
-  for name in dir_size['packed']:
-    dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
-
-  with open(os.path.join(reportdir, "README"), 'w') as f:
-    # Give a basic overview of this file
-    f.write("== Overall Statistics ==\n")
-    f.write("  Number of commits: {}\n".format(stats['num_commits']))
-    f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
-    f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
-    f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
-    f.write("\n")
-    f.write("  Total unpacked size (bytes): {:10d}\n"
-            .format(total_size['unpacked']))
-    f.write("  Total packed size (bytes): {:10d}\n"
-            .format(total_size['packed']))
-    f.write("\n")
-
-    # Mention issues with the report
-    f.write("== Caveats ==\n")
-    f.write("=== Sizes ===\n")
-    f.write(textwrap.dedent("""
-      Packed size represents what size your repository would be if no
-      trees, commits, tags, or other metadata were included (though it may
-      fail to represent de-duplication; see below). It also represents the
-      current packing, which may be suboptimal if you haven't gc'ed for a
-      while.
-
-      Unpacked size represents what size your repository would be if no
-      trees, commits, tags, or other metadata were included AND if no
-      files were packed; i.e., without delta-ing or compression.
-
-      Both unpacked and packed sizes can be slightly misleading. Deleting
-      a blob from history does not save as much space as the unpacked size,
-      because it is obviously normally stored in packed form. Also,
-      deleting a blob from history may not save as much space as its packed
-      size either, because another blob could be stored as a delta against
-      that blob, so when you remove one blob another blob's packed size may
-      grow.
-
-      Also, the sum of the packed sizes can add up to more than the
-      repository size; if the same contents appeared in the repository in
-      multiple places, git will automatically de-dupe and store only one
-      copy, while the way sizes are added in this analysis adds the size
-      for each file path that has those contents. Further, if a file is
-      ever reverted to a previous version's contents, the previous
-      version's size will be counted multiple times in this analysis, even
-      though git will only store it once.
-      """[1:]))
-    f.write("\n")
-    f.write("=== Deletions ===\n")
-    f.write(textwrap.dedent("""
-      Whether a file is deleted is not a binary quality, since it can be
-      deleted on some branches but still exist in others. Also, it might
-      exist in an old tag, but have been deleted in versions newer than
-      that. More thorough tracking could be done, including looking at
-      merge commits where one side of history deleted and the other modified,
-      in order to give a more holistic picture of deletions. However, that
-      algorithm would not only be more complex to implement, it'd also be
-      quite difficult to present and interpret by users. Since --analyze
-      is just about getting a high-level rough picture of history, it instead
-      implements the simplistic rule that is good enough for 98% of cases:
-        A file is marked as deleted if the last commit in the fast-export
-        stream that mentions the file lists it as deleted.
-      This makes it dependent on topological ordering, but generally gives
-      the "right" answer.
-      """[1:]))
-    f.write("\n")
-    f.write("=== Renames ===\n")
-    f.write(textwrap.dedent("""
-      Renames share the same non-binary nature that deletions do, plus
-      additional challenges:
-        * If the renamed file is renamed again, instead of just two names for
-          a path you can have three or more.
-        * Rename pairs of the form (oldname, newname) that we consider to be
-          different names of the "same file" might only be valid over certain
-          commit ranges. For example, if a new commit reintroduces a file
-          named oldname, then new versions of oldname aren't the "same file"
-          anymore. We could try to portray this to the user, but it's easier
-          for the user to just break the pairing and only report unbroken
-          rename pairings to the user.
-        * The ability for users to rename files differently in different
-          branches means that our chains of renames will not necessarily be
-          linear but may branch out.
-      """[1:]))
-    f.write("\n")
-
-  # Equivalence classes for names, so if folks only want to keep a
-  # certain set of paths, they know the old names they want to include
-  # too.
-  with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
-    seen = set()
-    for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
-                                       key=lambda x:x[1]):
-      if equiv_group in seen:
-        continue
-      seen.add(equiv_group)
-      f.write("{} ->\n    ".format(equiv_group[0]) +
-              "\n    ".join(equiv_group[1:]) +
-              "\n")
-
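As a concrete illustration of what renames.txt ends up containing (the paths and group are hypothetical), the write loop above renders one equivalence group like so:

  # Illustrative only: output of the renames.txt loop for one group.
  equiv_group = ('src/foo.c', 'src/bar.c', 'lib/bar.c')
  print("{} ->\n    ".format(equiv_group[0]) +
        "\n    ".join(equiv_group[1:]))
  # src/foo.c ->
  #     src/bar.c
  #     lib/bar.c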
-  # List directories in reverse sorted order of unpacked size
-  with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
-    f.write("=== Deleted directories by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
-    for dirname, size in sorted(dir_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      if (dir_deleted_data[dirname]):
-        f.write("  {:10d} {:10d} {:10s} {}\n"
-                .format(dir_size['unpacked'][dirname],
-                        size,
-                        datestr(dir_deleted_data[dirname]),
-                        dirname or '<toplevel>'))
-
-  with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
-    f.write("=== All directories by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
-    for dirname, size in sorted(dir_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10d} {:10s} {}\n"
-              .format(dir_size['unpacked'][dirname],
-                      size,
-                      datestr(dir_deleted_data[dirname]),
-                      dirname or '<toplevel>'))
-
-  # List extensions in reverse sorted order of unpacked size
-  with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
-    f.write("=== Deleted extensions by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
-    for extname, size in sorted(ext_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      if (ext_deleted_data[extname]):
-        f.write("  {:10d} {:10d} {:10s} {}\n"
-                .format(ext_size['unpacked'][extname],
-                        size,
-                        datestr(ext_deleted_data[extname]),
-                        extname or '<no extension>'))
-
-  with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
-    f.write("=== All extensions by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
-    for extname, size in sorted(ext_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10d} {:10s} {}\n"
-              .format(ext_size['unpacked'][extname],
-                      size,
-                      datestr(ext_deleted_data[extname]),
-                      extname or '<no extension>'))
-
-  # List files in reverse sorted order of unpacked size
-  with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
-    f.write("=== Deleted paths by reverse accumulated size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
-    for pathname, size in sorted(path_size['packed'].iteritems(),
-                                 key=lambda x:x[1], reverse=True):
-      when = stats['file_deletions'].get(pathname, None)
-      if when:
-        f.write("  {:10d} {:10d} {:10s} {}\n"
-                .format(path_size['unpacked'][pathname],
-                        size,
-                        datestr(when),
-                        pathname))
-
-  with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
-    f.write("=== All paths by reverse accumulated size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
-    for pathname, size in sorted(path_size['packed'].iteritems(),
-                                 key=lambda x:x[1], reverse=True):
-      when = stats['file_deletions'].get(pathname, None)
-      f.write("  {:10d} {:10d} {:10s} {}\n"
-              .format(path_size['unpacked'][pathname],
-                      size,
-                      datestr(when),
-                      pathname))
-
-  # List of filenames and sizes in descending order
-  with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
-    f.write("== Files by sha and associated pathnames in reverse size ==\n")
-    f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
-    for sha, size in sorted(stats['packed_size'].iteritems(),
-                            key=lambda x:x[1], reverse=True):
-      if sha not in stats['names']:
-        # Some objects in the repository might not be referenced, or not
-        # referenced by the branches/tags the user cares about; skip them.
-        continue
-      names_with_sha = stats['names'][sha]
-      if len(names_with_sha) == 1:
-        names_with_sha = names_with_sha.pop()
-      else:
-        names_with_sha = sorted(list(names_with_sha))
-      f.write("  {} {:10d} {:10d} {}\n".format(sha,
-                                               stats['unpacked_size'][sha],
-                                               size,
-                                               names_with_sha))
-
-def do_analysis(args, git_dir):
-  # Create the report directory as necessary
-  results_tmp_dir = os.path.join(git_dir, 'filter-repo')
-  if not os.path.isdir(results_tmp_dir):
-    os.mkdir(results_tmp_dir)
-  reportdir = os.path.join(results_tmp_dir, "analysis")
-  if not args.force and os.path.isdir(reportdir):
-    raise SystemExit("Error: {} already exists; refusing to overwrite!".
-                     format(reportdir))
-  os.mkdir(reportdir)
-
-  # Gather the data we need
-  stats = gather_data(args)
-
-  # Write the reports
-  sys.stdout.write("Writing reports to {}...".format(reportdir))
-  sys.stdout.flush()
-  write_report(reportdir, stats)
-  sys.stdout.write("done.\n")
-
+  @staticmethod
+  def analyze_commit(stats, graph, commit, parents, date, file_changes):
+    graph.add_commit_and_parents(commit, parents)
+    for change in file_changes:
+      modes, shas, change_types, filenames = change
+      if len(parents) == 1 and change_types.startswith('R'):
+        change_types = 'R' # remove the rename score; we don't care
+      if modes[-1] == '160000':
+        continue
+      elif modes[-1] == '000000':
+        # Track when files/directories are deleted
+        for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
+          if any(x == '040000' for x in modes[0:-1]):
+            stats['tree_deletions'][f] = date
+          else:
+            stats['file_deletions'][f] = date
+      elif change_types.strip('AMT') == '':
+        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
+      elif modes[-1] == '040000' and change_types.strip('RAM') == '':
+        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
+      elif change_types.strip('RAM') == '':
+        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
+        RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
+      else:
+        raise SystemExit("Unhandled change type(s): {} (in commit {})"
+                         .format(change_types, commit))
+
+  @staticmethod
+  def gather_data(args):
+    blob_size_progress = ProgressWriter()
+    num_blobs = 0
+
+    # Get sizes of blobs by sha1
+    cmd = '--batch-check=%(objectname) %(objecttype) ' + \
+          '%(objectsize) %(objectsize:disk)'
+    cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
+                          bufsize = -1,
+                          stdout = subprocess.PIPE)
+    unpacked_size = {}
+    packed_size = {}
+    for line in cf.stdout:
+      sha, objtype, objsize, objdisksize = line.split()
+      objsize, objdisksize = int(objsize), int(objdisksize)
+      if objtype == 'blob':
+        unpacked_size[sha] = objsize
+        packed_size[sha] = objdisksize
+      num_blobs += 1
+      blob_size_progress.show("Processed {} blob sizes".format(num_blobs))
+    cf.wait()
+    blob_size_progress.finish()
+    stats = {'names': collections.defaultdict(set),
+             'allnames' : set(),
+             'file_deletions': {},
+             'tree_deletions': {},
+             'equivalence': {},
+             'rename_history': collections.defaultdict(set),
+             'unpacked_size': unpacked_size,
+             'packed_size': packed_size,
+             'num_commits': 0}
+
+    # Setup the rev-list/diff-tree process
+    commit_parse_progress = ProgressWriter()
+    num_commits = 0
+    cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
+           ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
+           ' --date=short -M -t -c --raw --combined-all-paths')
+    dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
+    f = dtp.stdout
+    line = f.next()
+    cont = bool(line)
+    graph = AncestryGraph()
+    while cont:
+      commit = line.rstrip()
+      parents = f.next().split()
+      date = f.next().rstrip()
+
+      # We expect a blank line next; if we get a non-blank line then
+      # this commit modified no files and we need to move on to the next.
+      # If there is no line, we've reached end-of-input.
+      try:
+        line = f.next().rstrip()
+        cont = True
+      except StopIteration:
+        cont = False
+
+      # If we haven't reached end of input, and we got a blank line meaning
+      # a commit that has modified files, then get the file changes associated
+      # with this commit.
+      file_changes = []
+      if cont and not line:
+        cont = False
+        for line in f:
+          if not line.startswith(':'):
+            cont = True
+            break
+          n = 1+max(1, len(parents))
+          assert line.startswith(':'*(n-1))
+          relevant = line[n-1:-1]
+          splits = relevant.split(None, n)
+          modes = splits[0:n]
+          splits = splits[n].split(None, n)
+          shas = splits[0:n]
+          splits = splits[n].split('\t')
+          change_types = splits[0]
+          filenames = [PathQuoting.dequote(x) for x in splits[1:]]
+          file_changes.append([modes, shas, change_types, filenames])
+
+      # Analyze this commit and update progress
+      RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
+                                 file_changes)
+      num_commits += 1
+      commit_parse_progress.show("Processed {} commits".format(num_commits))
+
+    # Show the final commits processed message and record the number of commits
+    commit_parse_progress.finish()
+    stats['num_commits'] = num_commits
+
+    # Close the output, ensure rev-list|diff-tree pipeline completed successfully
+    dtp.stdout.close()
+    if dtp.wait():
+      raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
+
+    return stats
+
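The raw lines this inner loop parses look roughly like the hypothetical sample below (shortened shas, made-up similarity score); a single-parent commit uses one leading colon, so n == 2, and path dequoting is skipped here for brevity.

  # Hypothetical raw diff-tree line, for illustration only.
  parents = ['deadbeef']                      # single-parent commit
  line = ':100644 100644 1111111 2222222 R087\told.c\tnew.c\n'
  n = 1 + max(1, len(parents))                # n == 2
  relevant = line[n-1:-1]
  splits = relevant.split(None, n)
  modes = splits[0:n]                         # ['100644', '100644']
  splits = splits[n].split(None, n)
  shas = splits[0:n]                          # ['1111111', '2222222']
  splits = splits[n].split('\t')
  change_types = splits[0]                    # 'R087'
  filenames = splits[1:]                      # ['old.c', 'new.c']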
+  @staticmethod
+  def write_report(reportdir, stats):
+    def datestr(datetimestr):
+      return datetimestr if datetimestr else '<present>'
+
+    def dirnames(path):
+      while True:
+        path = os.path.dirname(path)
+        yield path
+        if path == '':
+          break
+
+    # Compute aggregate size information for paths, extensions, and dirs
+    total_size = {'packed': 0, 'unpacked': 0}
+    path_size = {'packed': collections.defaultdict(int),
+                 'unpacked': collections.defaultdict(int)}
+    ext_size = {'packed': collections.defaultdict(int),
+                'unpacked': collections.defaultdict(int)}
+    dir_size = {'packed': collections.defaultdict(int),
+                'unpacked': collections.defaultdict(int)}
+    for sha in stats['names']:
+      size = {'packed': stats['packed_size'][sha],
+              'unpacked': stats['unpacked_size'][sha]}
+      for which in ('packed', 'unpacked'):
+        for name in stats['names'][sha]:
+          total_size[which] += size[which]
+          path_size[which][name] += size[which]
+          basename, ext = os.path.splitext(name)
+          ext_size[which][ext] += size[which]
+          for dirname in dirnames(name):
+            dir_size[which][dirname] += size[which]
+
+    # Determine if and when extensions and directories were deleted
+    ext_deleted_data = {}
+    for name in stats['allnames']:
+      when = stats['file_deletions'].get(name, None)
+
+      # Update the extension
+      basename, ext = os.path.splitext(name)
+      if when is None:
+        ext_deleted_data[ext] = None
+      elif ext in ext_deleted_data:
+        if ext_deleted_data[ext] is not None:
+          ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
+      else:
+        ext_deleted_data[ext] = when
+
+    dir_deleted_data = {}
+    for name in dir_size['packed']:
+      dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
+
+    with open(os.path.join(reportdir, "README"), 'w') as f:
+      # Give a basic overview of this file
+      f.write("== Overall Statistics ==\n")
+      f.write("  Number of commits: {}\n".format(stats['num_commits']))
+      f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
+      f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
+      f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
+      f.write("\n")
+      f.write("  Total unpacked size (bytes): {:10d}\n"
+              .format(total_size['unpacked']))
+      f.write("  Total packed size (bytes): {:10d}\n"
+              .format(total_size['packed']))
+      f.write("\n")
+
+      # Mention issues with the report
+      f.write("== Caveats ==\n")
+      f.write("=== Sizes ===\n")
+      f.write(textwrap.dedent("""
+        Packed size represents what size your repository would be if no
+        trees, commits, tags, or other metadata were included (though it may
+        fail to represent de-duplication; see below). It also represents the
+        current packing, which may be suboptimal if you haven't gc'ed for a
+        while.
+
+        Unpacked size represents what size your repository would be if no
+        trees, commits, tags, or other metadata were included AND if no
+        files were packed; i.e., without delta-ing or compression.
+
+        Both unpacked and packed sizes can be slightly misleading. Deleting
+        a blob from history does not save as much space as the unpacked size,
+        because it is obviously normally stored in packed form. Also,
+        deleting a blob from history may not save as much space as its packed
+        size either, because another blob could be stored as a delta against
+        that blob, so when you remove one blob another blob's packed size may
+        grow.
+
+        Also, the sum of the packed sizes can add up to more than the
+        repository size; if the same contents appeared in the repository in
+        multiple places, git will automatically de-dupe and store only one
+        copy, while the way sizes are added in this analysis adds the size
+        for each file path that has those contents. Further, if a file is
+        ever reverted to a previous version's contents, the previous
+        version's size will be counted multiple times in this analysis, even
+        though git will only store it once.
+        """[1:]))
+      f.write("\n")
+      f.write("=== Deletions ===\n")
+      f.write(textwrap.dedent("""
+        Whether a file is deleted is not a binary quality, since it can be
+        deleted on some branches but still exist in others. Also, it might
+        exist in an old tag, but have been deleted in versions newer than
+        that. More thorough tracking could be done, including looking at
+        merge commits where one side of history deleted and the other modified,
+        in order to give a more holistic picture of deletions. However, that
+        algorithm would not only be more complex to implement, it'd also be
+        quite difficult to present and interpret by users. Since --analyze
+        is just about getting a high-level rough picture of history, it instead
+        implements the simplistic rule that is good enough for 98% of cases:
+          A file is marked as deleted if the last commit in the fast-export
+          stream that mentions the file lists it as deleted.
+        This makes it dependent on topological ordering, but generally gives
+        the "right" answer.
+        """[1:]))
+      f.write("\n")
+      f.write("=== Renames ===\n")
+      f.write(textwrap.dedent("""
+        Renames share the same non-binary nature that deletions do, plus
+        additional challenges:
+          * If the renamed file is renamed again, instead of just two names for
+            a path you can have three or more.
+          * Rename pairs of the form (oldname, newname) that we consider to be
+            different names of the "same file" might only be valid over certain
+            commit ranges. For example, if a new commit reintroduces a file
+            named oldname, then new versions of oldname aren't the "same file"
+            anymore. We could try to portray this to the user, but it's easier
+            for the user to just break the pairing and only report unbroken
+            rename pairings to the user.
+          * The ability for users to rename files differently in different
+            branches means that our chains of renames will not necessarily be
+            linear but may branch out.
+        """[1:]))
+      f.write("\n")
+
+    # Equivalence classes for names, so if folks only want to keep a
+    # certain set of paths, they know the old names they want to include
+    # too.
+    with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
+      seen = set()
+      for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
+                                         key=lambda x:x[1]):
+        if equiv_group in seen:
+          continue
+        seen.add(equiv_group)
+        f.write("{} ->\n    ".format(equiv_group[0]) +
+                "\n    ".join(equiv_group[1:]) +
+                "\n")
+
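A quick worked example of the double-counting caveat above, with made-up numbers: a single 1 MB blob reachable under two paths contributes 2 MB to the summed path sizes even though git stores it only once.

  # Illustrative only: one blob, two paths -> size counted twice.
  names = {'abc123': {'assets/logo.png', 'docs/logo.png'}}
  unpacked_size = {'abc123': 1048576}
  total = 0
  for sha, paths in names.items():
    for name in paths:
      total += unpacked_size[sha]
  print(total)  # 2097152, double the actual storage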
+    # List directories in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
+      f.write("=== Deleted directories by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+      for dirname, size in sorted(dir_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
+        if (dir_deleted_data[dirname]):
+          f.write("  {:10d} {:10d} {:10s} {}\n"
+                  .format(dir_size['unpacked'][dirname],
+                          size,
+                          datestr(dir_deleted_data[dirname]),
+                          dirname or '<toplevel>'))
+
+    with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
+      f.write("=== All directories by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+      for dirname, size in sorted(dir_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(dir_size['unpacked'][dirname],
+                        size,
+                        datestr(dir_deleted_data[dirname]),
+                        dirname or '<toplevel>'))
+
+    # List extensions in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
+      f.write("=== Deleted extensions by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+      for extname, size in sorted(ext_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
+        if (ext_deleted_data[extname]):
+          f.write("  {:10d} {:10d} {:10s} {}\n"
+                  .format(ext_size['unpacked'][extname],
+                          size,
+                          datestr(ext_deleted_data[extname]),
+                          extname or '<no extension>'))
+
+    with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
+      f.write("=== All extensions by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+      for extname, size in sorted(ext_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(ext_size['unpacked'][extname],
+                        size,
+                        datestr(ext_deleted_data[extname]),
+                        extname or '<no extension>'))
+
+    # List files in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
+      f.write("=== Deleted paths by reverse accumulated size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
+      for pathname, size in sorted(path_size['packed'].iteritems(),
+                                   key=lambda x:x[1], reverse=True):
+        when = stats['file_deletions'].get(pathname, None)
+        if when:
+          f.write("  {:10d} {:10d} {:10s} {}\n"
+                  .format(path_size['unpacked'][pathname],
+                          size,
+                          datestr(when),
+                          pathname))
+
+    with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
+      f.write("=== All paths by reverse accumulated size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
+      for pathname, size in sorted(path_size['packed'].iteritems(),
+                                   key=lambda x:x[1], reverse=True):
+        when = stats['file_deletions'].get(pathname, None)
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(path_size['unpacked'][pathname],
+                        size,
+                        datestr(when),
+                        pathname))
+
+    # List of filenames and sizes in descending order
+    with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
+      f.write("== Files by sha and associated pathnames in reverse size ==\n")
+      f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
+      for sha, size in sorted(stats['packed_size'].iteritems(),
+                              key=lambda x:x[1], reverse=True):
+        if sha not in stats['names']:
+          # Some objects in the repository might not be referenced, or not
+          # referenced by the branches/tags the user cares about; skip them.
+          continue
+        names_with_sha = stats['names'][sha]
+        if len(names_with_sha) == 1:
+          names_with_sha = names_with_sha.pop()
+        else:
+          names_with_sha = sorted(list(names_with_sha))
+        f.write("  {} {:10d} {:10d} {}\n".format(sha,
+                                                 stats['unpacked_size'][sha],
+                                                 size,
+                                                 names_with_sha))
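For illustration, one hypothetical line of blob-shas-and-paths.txt as produced by the f.write above (hash and sizes made up):

  # Illustrative only; hash and sizes are made up.
  sha = 'f1d2d2f924e986ac86fdf7b36c94bcdf32beec15'
  names_with_sha = ['assets/logo.png', 'docs/logo.png']
  print("  {} {:10d} {:10d} {}".format(sha, 1048576, 65536, names_with_sha))
  #   f1d2d2f924e986ac86fdf7b36c94bcdf32beec15    1048576      65536 ['assets/logo.png', 'docs/logo.png']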
+
+  @staticmethod
+  def run(args, git_dir):
+    # Create the report directory as necessary
+    results_tmp_dir = os.path.join(git_dir, 'filter-repo')
+    if not os.path.isdir(results_tmp_dir):
+      os.mkdir(results_tmp_dir)
+    reportdir = os.path.join(results_tmp_dir, "analysis")
+    if not args.force and os.path.isdir(reportdir):
+      raise SystemExit("Error: {} already exists; refusing to overwrite!".
+                       format(reportdir))
+    os.mkdir(reportdir)
+
+    # Gather the data we need
+    stats = RepoAnalyze.gather_data(args)
+
+    # Write the reports
+    sys.stdout.write("Writing reports to {}...".format(reportdir))
+    sys.stdout.flush()
+    RepoAnalyze.write_report(reportdir, stats)
+    sys.stdout.write("done.\n")
 
 def sanity_check(refs, is_bare):
   def abort(reason):

@@ -2506,7 +2524,7 @@ def run_fast_filter():
 
   # Do analysis, if requested
   if args.analyze:
-    do_analysis(args, git_dir)
+    RepoAnalyze.run(args, git_dir)
     return
 
   # Do sanity checks