From e87d073054c72fe964346c507ea1f9c9b0bbd766 Mon Sep 17 00:00:00 2001 From: Jost Migenda Date: Tue, 24 Feb 2026 21:06:57 +0000 Subject: [PATCH] improve timeit examples * vary number of repeats to ensure total runtime stays manageable * remove some ad hoc scaling factors * use total time instead of per-run time * format large numbers with _ for legibility --- ...optimisation-data-structures-algorithms.md | 66 ++++++------ episodes/optimisation-latency.md | 10 +- episodes/optimisation-numpy.md | 102 +++++++++--------- episodes/optimisation-using-python.md | 38 +++---- 4 files changed, 107 insertions(+), 109 deletions(-) diff --git a/episodes/optimisation-data-structures-algorithms.md b/episodes/optimisation-data-structures-algorithms.md index 6a36c43c..37c7aefa 100644 --- a/episodes/optimisation-data-structures-algorithms.md +++ b/episodes/optimisation-data-structures-algorithms.md @@ -89,29 +89,29 @@ from timeit import timeit def list_append(): li = [] - for i in range(100000): + for i in range(100_000): li.append(i) def list_preallocate(): - li = [0]*100000 - for i in range(100000): + li = [0]*100_000 + for i in range(100_000): li[i] = i def list_comprehension(): - li = [i for i in range(100000)] + li = [i for i in range(100_000)] repeats = 1000 -print(f"Append: {timeit(list_append, number=repeats):.2f}ms") -print(f"Preallocate: {timeit(list_preallocate, number=repeats):.2f}ms") -print(f"Comprehension: {timeit(list_comprehension, number=repeats):.2f}ms") +print(f"Append: {timeit(list_append, number=repeats):.2f} s") +print(f"Preallocate: {timeit(list_preallocate, number=repeats):.2f} s") +print(f"Comprehension: {timeit(list_comprehension, number=repeats):.2f} s") ``` `timeit` is used to run each function 1000 times, providing the below averages: ```output -Append: 3.50ms -Preallocate: 2.48ms -Comprehension: 1.69ms +Append: 3.50 s +Preallocate: 2.48 s +Comprehension: 1.69 s ``` Results will vary between Python versions, hardware and list lengths. 
But in this example list comprehension was 2x faster, with pre-allocate faring in the middle. Although this is milliseconds, this can soon add up if you are regularly creating lists. @@ -238,7 +238,7 @@ If you reduce the value of `repeats` it will run faster, how does changing the n import random from timeit import timeit -N = 25000 # Number of elements in the list +N = 25_000 # Number of elements in the list data = [random.randint(0, int(N/2)) for i in range(N)] def uniqueSet(): @@ -262,11 +262,11 @@ def uniqueListSort(): if ls_out[-1] != i: ls_out.append(i) -repeats = 1000 -print(f"uniqueSet: {timeit(uniqueSet, number=repeats):.2f}ms") -print(f"uniqueSetAdd: {timeit(uniqueSetAdd, number=repeats):.2f}ms") -print(f"uniqueList: {timeit(uniqueList, number=repeats):.2f}ms") -print(f"uniqueListSort: {timeit(uniqueListSort, number=repeats):.2f}ms") +repeats = 100 +print(f"uniqueSet: {timeit(uniqueSet, number=repeats):.3f} s") +print(f"uniqueSetAdd: {timeit(uniqueSetAdd, number=repeats):.3f} s") +print(f"uniqueList: {timeit(uniqueList, number=repeats):.3f} s") +print(f"uniqueListSort: {timeit(uniqueListSort, number=repeats):.3f} s") ``` :::::::::::::::::::::::: hint @@ -291,10 +291,10 @@ The naive list approach is 2200x slower than the fastest approach, because Sorting the input list reduces the cost of searching the output list significantly, however it is still 8x slower than the fastest approach. In part because around half of its runtime is now spent sorting the list. 
```output -uniqueSet: 0.30ms -uniqueSetAdd: 0.81ms -uniqueList: 660.71ms -uniqueListSort: 2.67ms +uniqueSet: 0.030 s +uniqueSetAdd: 0.081 s +uniqueList: 66.071 s +uniqueListSort: 0.267 s ``` ::::::::::::::::::::::::::::::::: ::::::::::::::::::::::::::::::::::::::::::::::: @@ -316,46 +316,46 @@ import random from timeit import timeit from bisect import bisect_left -N = 25000 # Number of elements in list +N = 25_000 # Number of elements in list M = 2 # N*M == Range over which the elements span -st = set([random.randint(0, int(N*M)) for i in range(N)]) +st = set([random.randint(0, N*M) for i in range(N)]) ls = list(st) ls.sort() # Sort required for binary search def search_set(): j = 0 - for i in range(0, int(N*M), M): + for i in range(0, N*M, M): if i in st: j += 1 def linear_search_list(): j = 0 - for i in range(0, int(N*M), M): + for i in range(0, N*M, M): if i in ls: j += 1 def binary_search_list(): j = 0 - for i in range(0, int(N*M), M): + for i in range(0, N*M, M): k = bisect_left(ls, i) if k != len(ls) and ls[k] == i: j += 1 -repeats = 1000 -print(f"search_set: {timeit(search_set, number=repeats):.2f}ms") -print(f"linear_search_list: {timeit(linear_search_list, number=repeats):.2f}ms") -print(f"binary_search_list: {timeit(binary_search_list, number=repeats):.2f}ms") +repeats = 10 +print(f"search_set: {timeit(search_set, number=repeats):.4f} s") +print(f"linear_search_list: {timeit(linear_search_list, number=repeats):.2f} s") +print(f"binary_search_list: {timeit(binary_search_list, number=repeats):.4f} s") ``` -Searching the set is fastest performing 25,000 searches in 0.57ms. +Searching the set is fastest, performing the task in 5.7 ms. This is followed by the binary search of the (sorted) list which is 6x slower, although the list has been filtered for duplicates. A list still containing duplicates would be longer, leading to a more expensive search. The linear search of the list is about 2700x slower than the fastest, it really shouldn't be used! 
```output -search_set: 0.57ms -linear_search_list: 1531.61ms -binary_search_list: 3.43ms +search_set: 0.0057 s +linear_search_list: 15.32 s +binary_search_list: 0.0343 s ``` These results are subject to change based on the number of items and the proportion of searched items that exist within the list. However, the pattern is likely to remain the same. Linear searches should be avoided! diff --git a/episodes/optimisation-latency.md b/episodes/optimisation-latency.md index e89a4f21..81f29b80 100644 --- a/episodes/optimisation-latency.md +++ b/episodes/optimisation-latency.md @@ -47,7 +47,7 @@ The below example code runs a small benchmark, whereby 10MB is written to disk a import os, time # Generate 10MB -data_len = 10000000 +data_len = 10_000_000 data = os.urandom(data_len) file_ct = 1000 file_len = int(data_len/file_ct) @@ -78,12 +78,12 @@ for i in range(file_ct): small_file.close() small_read_s = time.perf_counter() - start # Print Summary -print(f"{1:5d}x{data_len/1000000}MB Write: {large_write_s:.5f} seconds") +print(f"{1:5d}x{data_len/1_000_000}MB Write: {large_write_s:.5f} seconds") print(f"{file_ct:5d}x{file_len/1000}KB Write: {small_write_s:.5f} seconds") -print(f"{1:5d}x{data_len/1000000}MB Read: {large_read_s:.5f} seconds") +print(f"{1:5d}x{data_len/1_000_000}MB Read: {large_read_s:.5f} seconds") print(f"{file_ct:5d}x{file_len/1000}KB Read: {small_read_s:.5f} seconds") -print(f"{file_ct:5d}x{file_len/1000}KB Write was {small_write_s/large_write_s:.1f} slower than 1x{data_len/1000000}MB Write") -print(f"{file_ct:5d}x{file_len/1000}KB Read was {small_read_s/large_read_s:.1f} slower than 1x{data_len/1000000}MB Read") +print(f"{file_ct:5d}x{file_len/1000}KB Write was {small_write_s/large_write_s:.1f} slower than 1x{data_len/1_000_000}MB Write") +print(f"{file_ct:5d}x{file_len/1000}KB Read was {small_read_s/large_read_s:.1f} slower than 1x{data_len/1_000_000}MB Read") # Cleanup os.remove("large.bin") for i in range(file_ct): diff --git 
a/episodes/optimisation-numpy.md b/episodes/optimisation-numpy.md index 62c1c981..2eda663e 100644 --- a/episodes/optimisation-numpy.md +++ b/episodes/optimisation-numpy.md @@ -37,13 +37,13 @@ It adds restriction via its own [basic numeric types](https://numpy.org/doc/stab NumPy's arrays (not to be confused with the core Python `array` package) are static arrays. Unlike core Python's lists, they do not dynamically resize. Therefore, if you wish to append to a NumPy array, you must call `resize()` first. If you treat this like `append()` for a Python list, resizing for each individual append, you will be performing significantly more copies and memory allocations than a Python list. -The below example sees lists and arrays constructed from `range(100000)`. +The below example sees lists and arrays constructed from `range(100_000)`. ```python from timeit import timeit import numpy -N = 100000 # Number of elements in list/array +N = 100_000 # Number of elements in list/array def list_append(): ls = [] @@ -57,15 +57,15 @@ def array_resize(): ar[i] = i repeats = 1000 -print(f"list_append: {timeit(list_append, number=repeats):.2f}ms") -print(f"array_resize: {timeit(array_resize, number=repeats):.2f}ms") +print(f"list_append: {timeit(list_append, number=repeats):.2f} s") +print(f"array_resize: {timeit(array_resize, number=repeats):.2f} s") ``` For Python lists, we've seen earlier that list comprehensions are more efficient, so we prefer to avoid using a large number of `append` operations if possible. Similarly, we should try to avoid resizing NumPy arrays, where the overhead is even higher (5.2x slower than a list, probably 10x slower than list comprehension). ```output -list_append: 3.50ms -array_resize: 18.04ms +list_append: 3.50 s +array_resize: 18.04 s ``` Another difference, is that NumPy arrays typically require all data to be the same type (and a NumPy type). This enables more efficient access to elements, as they all exist contiguously in memory. 
In contrast, elements within Python lists can be of any type so the list always stores a pointer to where the element actually exists in memory, rather than the actual element. This has the side effect that if you are converting back and forth between Python lists and NumPy arrays, there is an additional overhead as it's not as simple as copying a single block of memory. @@ -97,11 +97,11 @@ The below example demonstrates the overhead of mixing Python lists and NumPy fun ```sh # Python list, numpy.random.choice() ->python -m timeit -s "import numpy; ls = list(range(10000))" "numpy.random.choice(ls)" +>python -m timeit -s "import numpy; ls = list(range(10_000))" "numpy.random.choice(ls)" 1000 loops, best of 5: 267 usec per loop # NumPy array, numpy.random.choice() ->python -m timeit -s "import numpy; ar = numpy.arange(10000)" "numpy.random.choice(ar)" +>python -m timeit -s "import numpy; ar = numpy.arange(10_000)" "numpy.random.choice(ar)" 50000 loops, best of 5: 4.06 usec per loop ``` @@ -111,11 +111,11 @@ Passing a Python list to `numpy.random.choice()` is 65.6x slower than passing a ```sh # Python list, Manually select 1 item ->python -m timeit -s "import numpy; ls = list(range(10000))" "ls[numpy.random.randint(len(ls))]" +>python -m timeit -s "import numpy; ls = list(range(10_000))" "ls[numpy.random.randint(len(ls))]" 200000 loops, best of 5: 1.19 usec per loop # NumPy array, Manually select 1 item ->python -m timeit -s "import numpy; ar = numpy.arange(10000)" "ar[numpy.random.randint(len(ar))]" +>python -m timeit -s "import numpy; ar = numpy.arange(10_000)" "ar[numpy.random.randint(len(ar))]" 200000 loops, best of 5: 1.22 usec per loop ``` @@ -208,7 +208,7 @@ Added Python sum array, skipped a couple of others--> ```python from timeit import timeit -N = 1000000 # Number of elements in list +N = 1_000_000 # Number of elements in list gen_list = f"ls = list(range({N}))" gen_array = f"import numpy; ar = numpy.arange({N}, dtype=numpy.int64)" @@ -218,24 +218,24 @@ 
py_sum_ar = "sum(ar*ar)" np_sum_ar = "numpy.sum(ar*ar)" np_dot_ar = "numpy.dot(ar, ar)" -repeats = 1000 -print(f"python_sum_list: {timeit(py_sum_ls, setup=gen_list, number=repeats):.2f}ms") -print(f"python_sum_array: {timeit(py_sum_ar, setup=gen_array, number=repeats):.2f}ms") -print(f"numpy_sum_array: {timeit(np_sum_ar, setup=gen_array, number=repeats):.2f}ms") -print(f"numpy_dot_array: {timeit(np_dot_ar, setup=gen_array, number=repeats):.2f}ms") +repeats = 100 +print(f"python_sum_list: {timeit(py_sum_ls, setup=gen_list, number=repeats):.3f} s") +print(f"python_sum_array: {timeit(py_sum_ar, setup=gen_array, number=repeats):.3f} s") +print(f"numpy_sum_array: {timeit(np_sum_ar, setup=gen_array, number=repeats):.3f} s") +print(f"numpy_dot_array: {timeit(np_dot_ar, setup=gen_array, number=repeats):.3f} s") ``` ```output -python_sum_list: 46.93ms -python_sum_array: 33.26ms -numpy_sum_array: 1.44ms -numpy_dot_array: 0.29ms +python_sum_list: 4.693 s +python_sum_array: 3.326 s +numpy_sum_array: 0.144 s +numpy_dot_array: 0.029 s ``` -* `python_sum_list` uses list comprehension to perform the multiplication, followed by the Python core `sum()`. This comes out at 46.93ms -* `python_sum_array` instead directly multiplies the two arrays (taking advantage of NumPy's vectorisation) but uses the core Python `sum()`, this comes in slightly faster at 33.26ms. -* `numpy_sum_array` again takes advantage of NumPy's vectorisation for the multiplication, and additionally uses NumPy's `sum()` implementation. These two rounds of vectorisation provide a much faster 1.44ms completion. -* `numpy_dot_array` instead uses NumPy's `dot()` to calculate the dot product in a single operation. This comes out the fastest at 0.29ms, 162x faster than `python_sum_list`. +* `python_sum_list` uses list comprehension to perform the multiplication, followed by the Python core `sum()`. 
This comes out at 4.693 s +* `python_sum_array` instead directly multiplies the two arrays (taking advantage of NumPy's vectorisation) but uses the core Python `sum()`, this comes in slightly faster. +* `numpy_sum_array` again takes advantage of NumPy's vectorisation for the multiplication, and additionally uses NumPy's `sum()` implementation. These two rounds of vectorisation provide a roughly 30x faster completion. +* `numpy_dot_array` instead uses NumPy's `dot()` to calculate the dot product in a single operation. This comes out the fastest, 162x faster than `python_sum_list`. ::::::::::::::::::::::::::::::::::::: callout @@ -301,7 +301,7 @@ for polygon_idx in range(n_polygons): points_per_polygon[polygon_idx] = out_points ``` -For about 500k points and 1000 polygons, the initial version of the code took about 20 hours to run. +For about 500,000 points and 1000 polygons, the initial version of the code took about 20 hours to run. Luckily, Shapely is built on top of NumPy, so she was able to apply functions to an array of points instead and wrote an improved version, which took just 20 minutes: @@ -387,7 +387,7 @@ from timeit import timeit import pandas import numpy -N = 100000 # Number of rows in DataFrame +N = 100_000 # Number of rows in DataFrame def genDataFrame(): numpy.random.seed(12) # Ensure each dataframe is identical @@ -400,7 +400,7 @@ def genDataFrame(): def pythagoras(row): return (row["f_vertical"]**2 + row["f_horizontal"]**2)**0.5 - + def for_range(): rtn = [] df = genDataFrame() @@ -415,25 +415,25 @@ def for_iterrows(): for row_idx, row in df.iterrows(): rtn.append(pythagoras(row)) return pandas.Series(rtn) - + def pandas_apply(): df = genDataFrame() return df.apply(pythagoras, axis=1) -repeats = 1000 +repeats = 10 gentime = timeit(genDataFrame, number=repeats) -print(f"for_range: {timeit(for_range, number=int(repeats/20))*20-gentime:.2f}ms") # scale with factor 20, otherwise it takes too long -print(f"for_iterrows: {timeit(for_iterrows, 
number=int(repeats/20))*20-gentime:.2f}ms") -print(f"pandas_apply: {timeit(pandas_apply, number=int(repeats/20))*20-gentime:.2f}ms") +print(f"for_range: {timeit(for_range, number=repeats)-gentime:.2f} s") +print(f"for_iterrows: {timeit(for_iterrows, number=repeats)-gentime:.2f} s") +print(f"pandas_apply: {timeit(pandas_apply, number=repeats)-gentime:.2f} s") ``` `apply()` is 4x faster than the two `for` approaches, as it avoids the Python `for` loop. ```output -for_range: 1582.47ms -for_iterrows: 1677.14ms -pandas_apply: 390.49ms +for_range: 15.82 s +for_iterrows: 16.77 s +pandas_apply: 3.90 s ``` However, rows don't exist in memory as arrays (columns do!), so `apply()` does not take advantage of NumPy's vectorisation. You may be able to go a step further and avoid explicitly operating on rows entirely by passing only the required columns to NumPy. @@ -456,9 +456,7 @@ def vectorize(): Once you’ve done that, measure your performance by running ```python -repeats = 1000 -gentime = timeit(genDataFrame, number=repeats) -print(f"vectorize: {timeit(vectorize, number=repeats)-gentime:.2f}ms") +print(f"vectorize: {timeit(vectorize, number=repeats)-gentime:.3f} s") ``` What result do you find? Does this match your expectations? @@ -488,13 +486,13 @@ def vectorize(): return pandas.Series(result) -print(f"vectorize: {timeit(vectorize, number=repeats)-gentime:.2f}ms") +print(f"vectorize: {timeit(vectorize, number=repeats)-gentime:.3f} s") ``` 264x faster than `apply()`, 1000x faster than the two `for` approaches! 
-``` -vectorize: 1.48ms +```output +vectorize: 0.015 s ``` ::::::::::::::::::::::::::::::::: @@ -512,22 +510,22 @@ def to_dict(): df_as_dict = df.to_dict(orient='index') return pandas.Series([(r['f_vertical']**2 + r['f_horizontal']**2)**0.5 for r in df_as_dict.values()]) -print(f"to_dict: {timeit(to_dict, number=repeats)*10-gentime:.2f}ms") +print(f"to_dict: {timeit(to_dict, number=repeats)-gentime:.2f} s") ``` -Whilst still nearly 100x slower than pure vectorisation, it's twice as fast as `apply()`. +Whilst still nearly 100x slower than pure vectorisation, it's three times as fast as `apply()`. -```sh -to_dict: 131.15ms +```output +to_dict: 1.31 s ``` -This is because indexing into Pandas' `Series` (rows) is significantly slower than a Python dictionary. There is a slight overhead to creating the dictionary (40ms in this example), however the stark difference in access speed is more than enough to overcome that cost for any large DataFrame. +This is because indexing into Pandas' `Series` (rows) is significantly slower than a Python dictionary. There is a slight overhead to creating the dictionary, however the stark difference in access speed is more than enough to overcome that cost for any large DataFrame. ```python from timeit import timeit import pandas as pandas -N = 100000 # Number of rows in DataFrame +N = 100_000 # Number of rows in DataFrame def genInput(): s = pandas.Series({'a' : 1, 'b' : 2}) @@ -544,16 +542,16 @@ def dictionary(): for i in range(N): y = d['a'] * d['b'] -repeats = 1000 -print(f"series: {timeit(series, number=repeats):.2f}ms") -print(f"dictionary: {timeit(dictionary, number=repeats):.2f}ms") +repeats = 100 +print(f"series: {timeit(series, number=repeats):.3f} s") +print(f"dictionary: {timeit(dictionary, number=repeats):.3f} s") ``` 65x slower! 
```output -series: 237.25ms -dictionary: 3.63ms +series: 23.725 s +dictionary: 0.363 s ``` ### Filter Early diff --git a/episodes/optimisation-using-python.md b/episodes/optimisation-using-python.md index b38b3933..d8f25eb9 100644 --- a/episodes/optimisation-using-python.md +++ b/episodes/optimisation-using-python.md @@ -39,7 +39,7 @@ For example, you might think to sum a list of numbers by using a for loop, as wo import random from timeit import timeit -N = 100000 # Number of elements in the list +N = 100_000 # Number of elements in the list # Ensure every list is the same random.seed(12) @@ -63,18 +63,18 @@ def builtinSum(): repeats = 1000 -print(f"manualSumC: {timeit(manualSumC, globals=globals(), number=repeats):.3f}ms") -print(f"manualSumPy: {timeit(manualSumPy, globals=globals(), number=repeats):.3f}ms") -print(f"builtinSum: {timeit(builtinSum, globals=globals(), number=repeats):.3f}ms") +print(f"manualSumC: {timeit(manualSumC, globals=globals(), number=repeats):.3f} s") +print(f"manualSumPy: {timeit(manualSumPy, globals=globals(), number=repeats):.3f} s") +print(f"builtinSum: {timeit(builtinSum, globals=globals(), number=repeats):.3f} s") ``` Even just replacing the iteration over indices (which may be a habit you’ve picked up if you first learned to program in C) with a more pythonic iteration over the elements themselves speeds up the code by about 2x. But even better, by switching to the built-in `sum()` function our code becomes about 8x faster and much easier to read while doing the exact same operation! ```output -manualSumC: 1.624ms -manualSumPy: 0.740ms -builtinSum: 0.218ms +manualSumC: 1.624 s +manualSumPy: 0.740 s +builtinSum: 0.218 s ``` This is because [built-in functions](https://docs.python.org/3/library/functions.html) (i.e. those that are available without importing packages) are typically implemented in the CPython back-end, so their performance benefits from bypassing the Python interpreter. 
@@ -115,11 +115,11 @@ from timeit import timeit N = 2500 # Number of elements in list M = 2 # N*M == Range over which the elements span -ls = [random.randint(0, int(N*M)) for i in range(N)] +ls = [random.randint(0, N*M) for i in range(N)] def manualSearch(): count = 0 - for even_number in range(0, int(N*M), M): + for even_number in range(0, N*M, M): for i in range(0, len(ls)): if ls[i] == even_number: count += 1 @@ -127,20 +127,20 @@ def manualSearch(): def operatorSearch(): count = 0 - for even_number in range(0, int(N*M), M): + for even_number in range(0, N*M, M): if even_number in ls: count += 1 -repeats = 1000 -print(f"manualSearch: {timeit(manualSearch, number=repeats):.2f}ms") -print(f"operatorSearch: {timeit(operatorSearch, number=repeats):.2f}ms") +repeats = 100 +print(f"manualSearch: {timeit(manualSearch, number=repeats):.2f} s") +print(f"operatorSearch: {timeit(operatorSearch, number=repeats):.2f} s") ``` This results in the manual Python implementation being 5x slower, doing the exact same operation! ```output -manualSearch: 152.15ms -operatorSearch: 28.43ms +manualSearch: 15.22 s +operatorSearch: 2.84 s ``` An easy approach to follow is that if two blocks of code do the same operation, the one that contains less Python is probably faster. This won't apply if you're using 3rd party packages written purely in Python though. @@ -221,13 +221,13 @@ random.seed(12) f = [f" {i:0>6d} {random.random():8.4f} " for i in range(N)] repeats = 1000 -print(f"manualSplit: {timeit(manualSplit, globals=globals(), number=repeats):.3f}ms") -print(f"builtinSplit: {timeit(builtinSplit, globals=globals(), number=repeats):.3f}ms") +print(f"manualSplit: {timeit(manualSplit, globals=globals(), number=repeats):.3f} s") +print(f"builtinSplit: {timeit(builtinSplit, globals=globals(), number=repeats):.3f} s") ``` ```output -manualSplit: 1.797ms -builtinSplit: 0.796ms +manualSplit: 1.797 s +builtinSplit: 0.796 s ``` :::::::::::::::::::::::::::::::::