Sorting algorithms. Comparison of performances

One of the most crucial operations in computer science is to sort a given list, that is to reorder its elements, such that every item is greater or equal to the previous one. Many sorting algorithms have been developed throughout the time and we can assess which ones are better than others using a metric based on their computational complexity. This core characteristic comes in two flavours: time complexity, or how much longer will the algorithm run when we increase the size of the list, and space complexity, or how much more resources it will use. This project is an implementation of chosen sorting algorithms that compares their individual runtimes against the same datasets to observe their time complexity in action.

Visit the project repository here!


            """Implementation of popular sorting algorithms with automated testing."""
            import time
            import random
            import pandas as pd
            from statistics import median


            class FileLoader():
                """Static class for loading integers or words from a .txt file."""

                @staticmethod
                def load_num(filename, lines):
                    """Return a list of given number of integers from a file."""
                    file = open('data/' + filename, 'r')
                    return [int(line) for line in random.sample(file.readlines(), lines)]

                @staticmethod
                def load_str(filename, words):
                    """Return a list of given number of words from a file."""
                    file = open('data/' + filename, 'r')
                    return [word[:-1] for word in random.sample(file.readlines(), words)]


            class Sorter():
                """Static class wrapping all sorting methods."""

                @classmethod
                def _bin_search(cls, arr, start, end, key):
                    """Return the index of the key in the array."""
                    # Subarray is one element - return the index
                    if start == end:
                        return start
                        # Calculate the index in between
                    mid = start + int((end-start) / 2)
                    # The searched value is either in the left or right subarray
                    if key > arr[mid]:
                        return cls._bin_search(arr, mid+1, end, key)
                    if key < arr[mid]:
                        return cls._bin_search(arr, start, mid, key)
                    # If the searched value is exactly at mid index, return it
                    return mid

                @staticmethod
                def is_sorted(arr):
                    """Return whether the passed array is sorted or not."""
                    for i in range(len(arr)-1):
                        if arr[i] > arr[i+1]:
                            return False
                    return True

                @staticmethod
                def selection_sort(arr):
                    """Return the array sorted using selection sort."""
                    # For each successive element in the array, swap it
                    # with the minimal element in the rest of the array.
                    arr = arr.copy()
                    for i in range(len(arr)):
                        i_min = i
                        for j in range(i+1, len(arr)):
                            if arr[j] < arr[i_min]:
                                i_min = j
                        arr[i], arr[i_min] = arr[i_min], arr[i]
                    return arr

                @staticmethod
                def insertion_sort(arr):
                    """Return the array sorted using insertion sort."""
                    # Place each element from the second one onwards
                    # in proper place by shifting it to the left until
                    # it's the first element or the next element is not greater.
                    arr = arr.copy()
                    for i in range(1, len(arr)):
                        ins = arr[i]
                        j = i - 1
                        while j > 0 and ins < arr[j]:
                            arr[j+1] = arr[j]
                            j -= 1
                        arr[j] = ins
                    return arr

                @classmethod
                def binary_insertion_sort(cls, arr):
                    """Return the array sorted using selection sort with binary search."""
                    # Insertion sort, but the place to insert each element
                    # is found using binary search.
                    arr = arr.copy()
                    for i in range(1, len(arr)):
                        ins = arr[i]
                        # Index for the item to be placed is
                        # found by binary search
                        target = cls._bin_search(arr, 0, i, ins)
                        j = i - 1
                        while j >= target:
                            arr[j+1] = arr[j]
                            j -= 1
                        arr[target] = ins
                    return arr

                @staticmethod
                def bubble_sort(arr):
                    """Return the array sorted using bubble sort."""
                    # Take each element and keep moving it to the right,
                    # until it reaches the end or greater element.
                    # O(n^2) complexity.
                    arr = arr.copy()
                    top = len(arr) - 1
                    for _ in range(len(arr)):
                        for j in range(top):
                            if arr[j] > arr[j+1]:
                                arr[j], arr[j+1] = arr[j+1], arr[j]
                        top -= 1
                    return arr

                @staticmethod
                def coctail_sort(arr):
                    """Return the array sorted using coctail sort."""
                    arr = arr.copy()
                    bottom = 0
                    top = len(arr) - 1
                    for _ in range(int(len(arr)/2)):
                        # We fix the largest element on the right
                        for j in range(bottom, top):
                            if arr[j] > arr[j+1]:
                                arr[j], arr[j+1] = arr[j+1], arr[j]
                        top -= 1

                        # We fix the smallest element on the left
                        for j in range(top, bottom, -1):
                            if arr[j] < arr[j-1]:
                                arr[j], arr[j-1] = arr[j-1], arr[j]
                        bottom += 1
                    return arr

                # Returns the list of step sizes for subarrays selection in shell sort.
                @staticmethod
                def _get_h_values(subarrays_type, arr):
                    h_values = []
                    # Set the separation values
                    if subarrays_type == 'halves':
                        h = int(len(arr)/2)
                        while h >= 1:
                            h_values.append(h)
                            h = int(h / 2)
                        h_values.reverse()
                    elif subarrays_type == 'thirds':
                        h = int(len(arr)/3)
                        while h > 1:
                            h_values.append(h)
                            h = int(h / 3)
                        h_values.append(1)
                        h_values.reverse()
                    elif subarrays_type == 'sedgewick':
                        h_values = [4**k + 3*2**(k-1) + 1 for k in range(12)
                                    if 4**k + 3*2**(k-1) + 1 < len(arr)]
                        h_values[0] = 1
                    return h_values

                @classmethod
                def shell_sort(cls, arr, subarrays_type='halves'):
                    """
                    Return the array sorted using shell sort.

                    type:
                    'halves', 'thirds', 'sedgewick'
                    Specify the method of selecting subarray sizes (the size of the step
                    when taking consecutive subarrays can be taken from multiple sources;
                    subarrays with different spread yield different complexities).
                    """
                    arr = arr.copy()
                    h_values = cls._get_h_values(subarrays_type, arr)
                    h_index = len(h_values) - 1
                    while h_index >= 0:
                        h = h_values[h_index]
                        # Like insertion sort, but on subarrays
                        for i in range(h, len(arr)):
                            ins = arr[i]
                            j = i
                            # Shift whole subarray until 'ins' is in proper position
                            while j >= h and ins < arr[j-h]:
                                arr[j] = arr[j-h]
                                j -= h
                            # Plug the considered element in proper position
                            arr[j] = ins
                        h_index -= 1
                    return arr

                # Returns the index of the pivot element for quick sort.
                @staticmethod
                def _select_pivot(piv_selection, arr):
                    if piv_selection == 'first':
                        # Select the first element as the pivot
                        pivot_index = 0
                    elif piv_selection == 'median':
                        # Rouge case check
                        if len(arr) == 2:
                            pivot_index = 0
                        else:
                            # Find the index of the median of three random elements
                            three = random.sample(range(len(arr)), 3)
                            for i in three:
                                if arr[i] == median(list(map(lambda x: arr[x], three))):
                                    pivot_index = i
                                    break
                    elif piv_selection == 'random':
                        # Select a random element as the pivot
                        pivot_index = random.randint(0, len(arr))
                    return pivot_index

                @classmethod
                def quick_sort(cls, arr, piv_selection='first'):
                    """
                    Return the array sorted using quick sort.

                    piv_selection:
                    'first', 'median', 'random'
                    Specify the selection of the pivot element from the unsorted array.
                    """
                    arr = arr.copy()
                    # One-item array is already sorted
                    if len(arr) <= 1:
                        return arr
                    # Select appropriate pivot element, as specified in function argument.
                    pivot_index = cls._select_pivot(piv_selection, arr)
                    # Swap pivot with the first element
                    arr[0], arr[pivot_index] = arr[pivot_index], arr[0]
                    pivot = arr[0]
                    s = 0
                    for i in range(1, len(arr)):
                        # Put all elements smaller than pivot next to it (at '++s' index)
                        if arr[i] < pivot:
                            s += 1
                            arr[s], arr[i] = arr[i], arr[s]
                    # Swap the last swapped element with the pivot.
                    # Now all items smaller than the pivot are on the left,
                    # and all items greater than the pivot are on the right.
                    arr[s], arr[0] = arr[0], arr[s]
                    # Repeat the process for the subarray on the left of the pivot
                    # and the one on the right of the pivot.
                    arr[:s] = cls.quick_sort(arr[:s])
                    arr[s+1:] = cls.quick_sort(arr[s+1:])
                    return arr

                @classmethod
                def quick_sort_insertion(cls, arr):
                    """Return the array sorted using quick sort with insertion sort."""
                    arr = arr.copy()
                    # One-item array is already sorted
                    if len(arr) <= 1:
                        return arr
                    # Small enough arrays can be dealt with using binary insertion sort
                    if len(arr) <= 4:
                        return cls.binary_insertion_sort(arr)
                    # Select the first element as the pivot
                    pivot = arr[0]
                    s = 0
                    for i in range(1, len(arr)):
                        # Put all elements smaller than pivot next to it (at '++s' index)
                        if arr[i] < pivot:
                            s += 1
                            arr[s], arr[i] = arr[i], arr[s]
                    # Swap the latest smaller element with the pivot.
                    # Now all items smaller than the pivot are on the left,
                    # and all items greater than the pivot are on the right.
                    arr[s], arr[0] = arr[0], arr[s]
                    # Repeat the process for the subarray on the left of the pivot
                    # and the one on the right of the pivot.
                    arr[:s] = cls.quick_sort_insertion(arr[:s])
                    arr[s+1:] = cls.quick_sort_insertion(arr[s+1:])
                    return arr


            class Tester():
                """Class for testing the performance of various sorting algorithms."""

                def __init__(self, data_type):
                    """
                    Create an instance of the class suited to a given data type.

                    data_type:
                    int, str, float
                    """
                    self.arr = []
                    self.data_type = data_type
                    self.arr_size = 0

                def prepare_array(self, arr_size):
                    """Import the specified number of speficied objects."""
                    self.arr_size = arr_size
                    if self.data_type == int:
                        self.arr = FileLoader.load_num('data.txt', arr_size)
                    elif self.data_type == str:
                        self.arr = FileLoader.load_str('words.txt', arr_size)
                    elif self.data_type == float:
                        self.arr = list(map(lambda x: x * random.random(),
                                            FileLoader.load_num('data.txt', arr_size)))
                    # Output a description of the loaded array.
                    print(f'\nArray size: {arr_size}')
                    print(f'Data type: {self.data_type}')
                    print('Status: '
                          + f'{"sorted" if Sorter.is_sorted(self.arr) else "unsorted"}')

                def perform_test(self, sorting_alg):
                    """Sort the created array using specified sorting algorithm."""
                    # Tidy up and output the name of the tested algorithm.
                    alg_name = sorting_alg.capitalize().replace('sort', 'sort,').strip(',')
                    print(f'\n=== {alg_name} ===')

                    # Select the chosen algorithm and perform the sorting.
                    start = time.perf_counter()
                    if sorting_alg == 'selection sort':
                        result = Sorter.selection_sort(self.arr)
                    elif sorting_alg == 'insertion sort':
                        result = Sorter.insertion_sort(self.arr)
                    elif sorting_alg == 'binary insertion sort':
                        result = Sorter.binary_insertion_sort(self.arr)
                    elif sorting_alg == 'bubble sort':
                        result = Sorter.bubble_sort(self.arr)
                    elif sorting_alg == 'coctail sort':
                        result = Sorter.coctail_sort(self.arr)
                    elif sorting_alg == 'shell sort halves':
                        result = Sorter.shell_sort(self.arr)
                    elif sorting_alg == 'shell sort thirds':
                        result = Sorter.shell_sort(self.arr, 'thirds')
                    elif sorting_alg == 'shell sort sedgewick':
                        result = Sorter.shell_sort(self.arr, 'sedgewick')
                    elif sorting_alg == 'quick sort first':
                        result = Sorter.quick_sort(self.arr)
                    elif sorting_alg == 'quick sort median':
                        result = Sorter.quick_sort(self.arr, 'median')
                    elif sorting_alg == 'quick sort random':
                        result = Sorter.quick_sort(self.arr, 'random')
                    elif sorting_alg == 'quick sort insertion':
                        result = Sorter.quick_sort_insertion(self.arr)
                    stop = time.perf_counter()
                    # Save and output the time of the algorithm and the state of the array.
                    performance = round(stop-start, 5)
                    print(f'Elapsed time: {performance} seconds')
                    print('Status: '
                          + f'{"sorted" if Sorter.is_sorted(result) else "unsorted"}')
                    # Return the pair: the name of the algorithm and its performance time.
                    return (alg_name, performance)

                def summarize_performance(self, group='all'):
                    """Present the comparison of performances of each algorithm."""
                    # Define and store all algorithm names to be recognized by the tester.
                    alg_names = ['selection sort',
                                 'insertion sort',
                                 'binary insertion sort',
                                 'bubble sort',
                                 'coctail sort',
                                 'shell sort halves',
                                 'shell sort thirds',
                                 'shell sort sedgewick',
                                 'quick sort first',
                                 'quick sort median',
                                 'quick sort random',
                                 'quick sort insertion']
                    # Determine the subclass of the algorithms.
                    if group == 'all':
                        tested_algs = alg_names
                        sizes = [1000, 2000, 4000, 8000, 16000]
                    elif group == 'quick':
                        tested_algs = alg_names[5:]
                        sizes = [16000, 32000, 64000, 128000, 256000, 512000, 1024000]
                    elif group == 'basic':
                        sizes = [1000, 2000, 4000, 8000, 16000]
                        tested_algs = alg_names[:5]
                    # Create a dataframe to hold the performances on different array sizes.
                    timetable = pd.DataFrame().rename_axis(
                        index='Algorithm', columns='Data size')
                    # Iterate over orders of magnitude of the array size.
                    for test_arr_size in sizes:
                        summary = pd.Series(name=test_arr_size)
                        self.prepare_array(test_arr_size)
                        for name in tested_algs:
                            alg_name, perf_time = int_tester.perform_test(name)
                            summary[alg_name] = perf_time
                        timetable = pd.concat([timetable, summary], axis=1)
                    return timetable


            if __name__ == "__main__":
                # Create an integer sorting tester...
                int_tester = Tester(int)
                # ...specify the testing data size...
                int_tester.prepare_array(5000)
                # ...and perform tests
                int_tester.perform_test('insertion sort')
                int_tester.perform_test('binary insertion sort')
                # Save results for later...
                algorithm, performance_time = int_tester.perform_test('selection sort')
                print(f'{algorithm}: {performance_time} seconds')
                # ...or let the code do all the work for you!
                print(int_tester.summarize_performance('all'))