Java/src/main/java/com/thealgorithms/compression/ShannonFano.java at master · Shewale41/Java · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
package com.thealgorithms.compression;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * An implementation of the Shannon-Fano algorithm for generating prefix codes.
 *
 * <p>Shannon-Fano coding is an entropy encoding technique for lossless data
 * compression. It assigns variable-length codes to symbols based on their
 * frequencies of occurrence. It is a precursor to Huffman coding and works by
 * recursively partitioning a sorted list of symbols into two sub-lists with
 * nearly equal total frequencies.
 *
 * <p>The algorithm works as follows:
 * <ol>
 * <li>Count the frequency of each symbol in the input data.</li>
 * <li>Sort the symbols in descending order of their frequencies.</li>
 * <li>Recursively divide the list of symbols into two parts with sums of
 * frequencies as close as possible to each other.</li>
 * <li>Assign a '0' bit to the codes in the first part and a '1' bit to the codes
 * in the second part.</li>
 * <li>Repeat the process for each part until a part contains only one symbol.</li>
 * </ol>
 *
 * <p>Time Complexity: O(n^2) in this implementation due to the partitioning logic,
 * or O(n log n) if a more optimized partitioning strategy is used.
 * Sorting takes O(n log n), where n is the number of unique symbols.
 *
 * <p>References:
 * <ul>
 * <li><a href="https://en.wikipedia.org/wiki/ShannonÃ¢â‚¬"Fano_coding">Wikipedia: ShannonÃ¢â‚¬"Fano coding</a></li>
 * </ul>
 */
public final class ShannonFano {

    /**
     * Private constructor to prevent instantiation of this utility class.
     */
    private ShannonFano() {
    }

    /**
     * A private inner class to represent a symbol and its frequency.
     * Implements Comparable to allow sorting based on frequency.
     */
    private static class Symbol implements Comparable<Symbol> {
        final char character;
        final int frequency;
        String code = "";

        Symbol(char character, int frequency) {
            this.character = character;
            this.frequency = frequency;
        }

        @Override
        public int compareTo(Symbol other) {
            return Integer.compare(other.frequency, this.frequency); // Sort descending
        }
    }

    /**
     * Generates Shannon-Fano codes for the symbols in a given text.
     *
     * @param text The input string for which to generate codes. Must not be null.
     * @return A map where keys are characters and values are their corresponding Shannon-Fano codes.
     */
    public static Map<Character, String> generateCodes(String text) {
        if (text == null || text.isEmpty()) {
            return Collections.emptyMap();
        }

        Map<Character, Integer> frequencyMap = new HashMap<>();
        for (char c : text.toCharArray()) {
            frequencyMap.put(c, frequencyMap.getOrDefault(c, 0) + 1);
        }

        List<Symbol> symbols = new ArrayList<>();
        for (Map.Entry<Character, Integer> entry : frequencyMap.entrySet()) {
            symbols.add(new Symbol(entry.getKey(), entry.getValue()));
        }

        Collections.sort(symbols);

        // Special case: only one unique symbol
        if (symbols.size() == 1) {
            symbols.getFirst().code = "0";
        } else {
            buildCodeTree(symbols, 0, symbols.size() - 1, "");
        }

        return symbols.stream().collect(Collectors.toMap(s -> s.character, s -> s.code));
    }

    /**
     * Recursively builds the Shannon-Fano code tree by partitioning the list of symbols.
     * Uses index-based approach to avoid sublist creation issues.
     *
     * @param symbols The sorted list of symbols to be processed.
     * @param start   The start index of the current partition.
     * @param end     The end index of the current partition (inclusive).
     * @param prefix  The current prefix code being built for the symbols in this partition.
     */
    private static void buildCodeTree(List<Symbol> symbols, int start, int end, String prefix) {
        // The initial check in generateCodes ensures start <= end is always true here.
        // The base case is when a partition has only one symbol.
        if (start == end) {
            symbols.get(start).code = prefix;
            return;
        }

        // Find the optimal split point
        int splitIndex = findSplitIndex(symbols, start, end);

        // Recursively process left and right partitions with updated prefixes
        buildCodeTree(symbols, start, splitIndex, prefix + "0");
        buildCodeTree(symbols, splitIndex + 1, end, prefix + "1");
    }

    /**
     * Finds the index that splits the range into two parts with the most balanced frequency sums.
     * This method tries every possible split point and returns the index that minimizes the
     * absolute difference between the two partition sums.
     *
     * @param symbols The sorted list of symbols.
     * @param start   The start index of the range.
     * @param end     The end index of the range (inclusive).
     * @return The index of the last element in the first partition.
     */
    private static int findSplitIndex(List<Symbol> symbols, int start, int end) {
        // Calculate total frequency for the entire range
        long totalFrequency = 0;
        for (int i = start; i <= end; i++) {
            totalFrequency += symbols.get(i).frequency;
        }

        long leftSum = 0;
        long minDifference = Long.MAX_VALUE;
        int splitIndex = start;

        // Try every possible split point and find the one with minimum difference
        for (int i = start; i < end; i++) {
            leftSum += symbols.get(i).frequency;
            long rightSum = totalFrequency - leftSum;
            long difference = Math.abs(leftSum - rightSum);

            if (difference < minDifference) {
                minDifference = difference;
                splitIndex = i;
            }
        }
        return splitIndex;
    }
}