{- |
Description: Huffman codes
Copyright: Copyright (C) 2021 Yoo Chung
License: GPL-3.0-or-later
Maintainer: dev@chungyc.org

Part of Ninety-Nine Haskell "Problems".  Some solutions are in "Solutions.P50".
-}
module Problems.P50
  ( huffman
    -- * Supporting functions
    -- | The functions below are not part of the problem.
    -- Instead, they are used to illustrate the use of Huffman coding.
  , countCharacters
  , encodeHuffman
  , decodeHuffman
  , loweralpha
  , ascii
  , text
  ) where

import           Data.List     (group, sort)
import           Data.Map.Lazy (Map)
import qualified Data.Map.Lazy as Map
import           Data.Maybe    (mapMaybe)
import qualified Solutions.P50 as Solution

{- |
Given a list of characters and their number of occurrences,
construct a list of the characters and their [Huffman encoding](https://brilliant.org/wiki/huffman-encoding/).

In the encoding, @'0'@ and @'1'@ will denote the 0 and 1 bits.

=== Examples

>>> huffman [('a',45),('b',13),('c',12),('d',16),('e',9),('f',5)]
[('a',"0"),('b',"101"),('c',"100"),('d',"111"),('e',"1101"),('f',"1100")]

The encoding table computed by 'huffman' can be used to compress data:

>>> length $ encodeHuffman (huffman $ countCharacters text) text
3552

Compare this to the length of a fixed-length 5-bit encoding:

>>> length $ encodeHuffman loweralpha text
4375

or the length of the more standard ASCII encoding with 8 bits:

>>> length $ encodeHuffman ascii text
7000

Huffman coding is unambiguous, so we can get back the original text:

>>> let table = huffman $ countCharacters text
>>> let encodedText = encodeHuffman table text
>>> let decodedText = decodeHuffman table encodedText
>>> decodedText == text
True
-}
huffman :: [(Char,Int)] -> [(Char,String)]
huffman :: [(Char, Int)] -> [(Char, String)]
huffman = [(Char, Int)] -> [(Char, String)]
Solution.huffman

-- | Count the number of occurrences of a character in a string.
countCharacters :: String -> [(Char,Int)]
countCharacters :: String -> [(Char, Int)]
countCharacters String
s = (String -> Maybe (Char, Int)) -> [String] -> [(Char, Int)]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe String -> Maybe (Char, Int)
forall {a}. [a] -> Maybe (a, Int)
count ([String] -> [(Char, Int)]) -> [String] -> [(Char, Int)]
forall a b. (a -> b) -> a -> b
$ String -> [String]
forall a. Eq a => [a] -> [[a]]
group (String -> [String]) -> String -> [String]
forall a b. (a -> b) -> a -> b
$ String -> String
forall a. Ord a => [a] -> [a]
sort String
s
  where count :: [a] -> Maybe (a, Int)
count [] = Maybe (a, Int)
forall a. Maybe a
Nothing
        count xs :: [a]
xs@(a
x:[a]
_) = (a, Int) -> Maybe (a, Int)
forall a. a -> Maybe a
Just (a
x, [a] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [a]
xs)

-- | Given an encoding table and a string, encode the string.
--
-- While this is intended for use in illustrating Huffman coding, it is not limited to such.
-- In particular, it can encode with a fixed-length encoding table.
encodeHuffman :: [(Char,String)] -> String -> String
encodeHuffman :: [(Char, String)] -> String -> String
encodeHuffman [(Char, String)]
table String
string = Map Char String -> String -> String -> String
encodeHuffman' ([(Char, String)] -> Map Char String
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList [(Char, String)]
table) String
string String
""

encodeHuffman' :: Map Char String -> String -> String -> String
encodeHuffman' :: Map Char String -> String -> String -> String
encodeHuffman' Map Char String
_ String
"" String
encoded = String -> String
forall a. [a] -> [a]
reverse String
encoded
encodeHuffman' Map Char String
table (Char
c:String
string) String
encoded =
  case Char -> Map Char String -> Maybe String
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Char
c Map Char String
table of
    Maybe String
Nothing -> String
forall a. HasCallStack => a
undefined
    Just String
e  -> Map Char String -> String -> String -> String
encodeHuffman' Map Char String
table String
string (String -> String) -> String -> String
forall a b. (a -> b) -> a -> b
$ String -> String
forall a. [a] -> [a]
reverse String
e String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
encoded

-- | Given an encoding table and a string, decode the string.
--
-- While this is intended for use in illustrating Huffman coding, it is not limited to such.
-- In particular, it can decode with a fixed-length encoding table.
decodeHuffman :: [(Char,String)] -> String -> String
decodeHuffman :: [(Char, String)] -> String -> String
decodeHuffman [(Char, String)]
table String
string = Map String Char -> String -> String -> String -> String
decodeHuffman' ([(String, Char)] -> Map String Char
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(String, Char)] -> Map String Char)
-> [(String, Char)] -> Map String Char
forall a b. (a -> b) -> a -> b
$ ((Char, String) -> (String, Char))
-> [(Char, String)] -> [(String, Char)]
forall a b. (a -> b) -> [a] -> [b]
map (\(Char
x,String
y) -> (String -> String
forall a. [a] -> [a]
reverse String
y, Char
x)) [(Char, String)]
table) String
"" String
string String
""

decodeHuffman' :: Map String Char -> String -> String -> String -> String
decodeHuffman' :: Map String Char -> String -> String -> String -> String
decodeHuffman' Map String Char
_ String
"" String
"" String
decoded = String -> String
forall a. [a] -> [a]
reverse String
decoded
decodeHuffman' Map String Char
table String
code String
encoded String
decoded =
  case String -> Map String Char -> Maybe Char
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup String
code Map String Char
table of
    Maybe Char
Nothing -> case String
encoded of
      (Char
c:String
encoded') -> Map String Char -> String -> String -> String -> String
decodeHuffman' Map String Char
table (Char
cChar -> String -> String
forall a. a -> [a] -> [a]
:String
code) String
encoded' String
decoded
      String
""           -> String
forall a. HasCallStack => a
undefined
    Just Char
c -> Map String Char -> String -> String -> String -> String
decodeHuffman' Map String Char
table String
"" String
encoded (Char
cChar -> String -> String
forall a. a -> [a] -> [a]
:String
decoded)

-- | Fixed-length encoding of lower case letters and a space using 5 bits.
loweralpha :: [(Char,String)]
loweralpha :: [(Char, String)]
loweralpha = [Char -> (Char, String)
forall {a}. Enum a => a -> (a, String)
encode Char
c | Char
c <- Char
' ' Char -> String -> String
forall a. a -> [a] -> [a]
: [Char
'a'..Char
'z']]
  where encode :: a -> (a, String)
encode a
c = (a
c, Int -> String
toBits (Int -> String) -> Int -> String
forall a b. (a -> b) -> a -> b
$ a -> Int
forall a. Enum a => a -> Int
fromEnum a
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Char -> Int
forall a. Enum a => a -> Int
fromEnum Char
'a')
        toBits :: Int -> String
toBits Int
n = Int -> Int -> String
getBits Int
n Int
5

-- | Fixed-length encoding of ASCII characters using 8 bits.
ascii :: [(Char,String)]
ascii :: [(Char, String)]
ascii = [Char -> (Char, String)
forall {a}. Enum a => a -> (a, String)
encode Char
c | Char
c <- [Char
'\0'..Char
'\127']]
  where encode :: a -> (a, String)
encode a
c = (a
c, Int -> String
toBits (Int -> String) -> Int -> String
forall a b. (a -> b) -> a -> b
$ a -> Int
forall a. Enum a => a -> Int
fromEnum a
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Char -> Int
forall a. Enum a => a -> Int
fromEnum Char
'\0')
        toBits :: Int -> String
toBits Int
n = Int -> Int -> String
getBits Int
n Int
8

getBits :: Int -> Int -> String
getBits :: Int -> Int -> String
getBits Int
_ Int
0 = []
getBits Int
n Int
b = Char
bit Char -> String -> String
forall a. a -> [a] -> [a]
: Int -> Int -> String
getBits (Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
2) (Int
bInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1)
  where bit :: Char
bit | Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
2 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1 = Char
'1'
            | Bool
otherwise      = Char
'0'

-- | Long text against which various encoding schemes can be tried.
text :: String
text :: String
text =
  String
"this is going to be a very long string of text which tries to use all letters " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"of the alphabet  there is no punctuation and no upper case letters because i " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"did not want to write more code to create the encoding table  and it also means " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"it can use fewer bits to encode each letter with a fixed size encoding  " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"using most of the letters means that the five bit encoding is the smallest " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"fixed size encoding for this text  and i actually find it easier to write out " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"this text randomly instead of carefully gathering the letters used by a certain " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"text and getting a smaller fixed size encoding from just those letters  " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"it is trying to be zany for the sake of being zany  being quite long  it is also " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"more convincing that the extra space for the huffman coding table can be worth " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"the extra cost because it is less than the savings we get from encoding a long text " String -> String -> String
forall a. [a] -> [a] -> [a]
++
  String
"with huffman coding"