Code Snippets 
Data/UTF8.hs 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
 {-# LANGUAGE Safe #-} 
-------------------------------------------------------------------------------- 
module  Data.UTF8 
  (  fromSingleBytes 
  ,  toSingleBytes 
  )  where 
-------------------------------------------------------------------------------- 
import            Data.Bits 
  (  ( .&. ) 
  ,  ( .|. ) 
  ,  shiftL 
  ,  shiftR 
  ) 
import            Data.Word 
  (  Word8 
  ) 
-------------------------------------------------------------------------------- 
fromSingleBytes 
  ::  [  Word8  ] 
  ->  Either  String  [  Char  ] 
toSingleBytes 
  ::  [  Char  ] 
  ->  Either  String  [  Word8  ] 
-------------------------------------------------------------------------------- 
l2s 
  ::  ( Ord  a ,  Num  a ) 
  =>  [ a ] 
  ->  Bool 
l3s 
  ::  ( Ord  a ,  Num  a ) 
  =>  [ a ] 
  ->  Bool 
  
l4s 
  ::  ( Ord  a ,  Num  a ) 
  =>  [ a ] 
  ->  Bool 
fsb 
  ::  [  Word8  ] 
  ->  Either  String  [  Char  ] 
u2a 
  ::  Char 
  ->  Either  String  [  Word8  ] 
tsb 
  ::  [  Char  ] 
  ->  Either  String  [  Word8  ] 
msg 
  ::  String 
  ->  String 
-------------------------------------------------------------------------------- 
fromSingleBytes  = 
  fsb 
toSingleBytes  = 
  tsb 
-------------------------------------------------------------------------------- 
{-
* Table 3.1. UTF-8 Bit Distribution
+----------------------------+----------+----------+----------+----------+
| Scalar                     | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
+----------------------------+----------+----------+----------+----------+
| 00000000 0xxxxxxx          | 0xxxxxxx |          |          |          |
| 00000yyy yyxxxxxx          | 110yyyyy | 10xxxxxx |          |          |
| zzzzyyyy yyxxxxxx          | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
| 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
+----------------------------+----------+----------+----------+----------+
* Table 3.1B. Legal UTF-8 Byte Sequences
+----------------------+----------+----------+----------+----------+
| Code Points          | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
+----------------------+----------+----------+----------+----------+
| U+000000 .. U+00007F | 00 .. 7F |          |          |          |
| U+000080 .. U+0007FF | C2 .. DF | 80 .. BF |          |          |
| U+000800 .. U+000FFF | E0       | A0 .. BF | 80 .. BF |          |
| U+001000 .. U+00FFFF | E1 .. EF | 80 .. BF | 80 .. BF |          |
| U+010000 .. U+03FFFF | F0       | 90 .. BF | 80 .. BF | 80 .. BF |
| U+040000 .. U+0FFFFF | F1 .. F3 | 80 .. BF | 80 .. BF | 80 .. BF |
| U+100000 .. U+10FFFF | F4       | 80 .. 8F | 80 .. BF | 80 .. BF |
+----------------------+----------+----------+----------+----------+
Source: https://unicode.org/versions/corrigendum1.html
-} 
l2s  ( b1 : b2 : [] )  = 
  -- U+000080 .. U+0007FF | C2 .. DF | 80 .. BF 
  b1  >  0xC1  &&  b1  <  0xE0  && 
  b2  >  0x7F  &&  b2  <  0xC0 
l2s  _  =  False 
l3s  ( b1 : b2 : b3 : [] )  = 
  -- U+000800 .. U+000FFF | E0       | A0 .. BF | 80 .. BF 
  -- U+001000 .. U+00FFFF | E1 .. EF | 80 .. BF | 80 .. BF 
  ( b1         ==       0xE0  && 
   b2  >  0x9F  &&  b2  <  0xC0  && 
   b3  >  0x7F  &&  b3  <  0xC0 ) 
  || 
  ( b1  >  0xE0  &&  b1  <  0xF0  && 
   b2  >  0x7F  &&  b2  <  0xC0  && 
   b3  >  0x7F  &&  b3  <  0xC0 )   
l3s  _  =  False 
l4s  ( b1 : b2 : b3 : b4 : [] )  = 
  -- U+010000 .. U+03FFFF | F0       | 90 .. BF | 80 .. BF | 80 .. BF 
  -- U+040000 .. U+0FFFFF | F1 .. F3 | 80 .. BF | 80 .. BF | 80 .. BF 
  -- U+100000 .. U+10FFFF | F4       | 80 .. 8F | 80 .. BF | 80 .. BF 
  ( b1         ==       0xF0  && 
   b2  >  0x8F  &&  b2  <  0xC0  && 
   b3  >  0x7F  &&  b3  <  0xC0  && 
   b4  >  0x7F  &&  b4  <  0xC0 ) 
  || 
  ( b1  >  0xF0  &&  b1  <  0xF4  && 
   b2  >  0x7F  &&  b2  <  0xC0  && 
   b3  >  0x7F  &&  b3  <  0xC0  && 
   b4  >  0x7F  &&  b4  <  0xC0 ) 
  || 
  ( b1         ==       0xF4  && 
   b2  >  0x7F  &&  b2  <  0x90  && 
   b3  >  0x7F  &&  b3  <  0xC0  && 
   b4  >  0x7F  &&  b4  <  0xC0 ) 
l4s  _  =  False 
fsb  [        ]  = 
  Right  [] 
fsb  bs  @  ( w : _ ) 
  -- "w" can't be less than 0 or greater than 0xFF (HS Word8 constraints) 
  |  w  <  0x80   =  bs01  bs 
  |  w  <  0xC2   =  Left  $  msg  $  "Byte: "  ++  show  w 
  |  w  <  0xE0   =  bs02  bs 
  |  w  <  0xF0   =  bs03  bs 
  |  w  <  0xF5   =  bs04  bs 
  |  otherwise  =  Left  $  msg  $  "Byte: "  ++  show  w 
  where 
    ( .<. )  x  y  =  x  ` shiftL `  y 
    char  =  toEnum  .  fromIntegral 
    bs01  ( b1 : xs )  = 
      fsb  xs  >>=  \ ys  -> 
      Right  $  char  b1  :  ys 
    bs01  _______  = 
      Left  $  "Exhaustive pattern match." 
    bs02  ( b1 : b2 : xs ) 
      |  l2s  ( b1 : b2 : [] )  = 
        let 
          a  =  ( fromIntegral  $  b1  .&.  0x1F  ::  Int )  .<.  0006 
          b  =   fromIntegral  $  b2  .&.  0x3F  ::  Int 
        in 
          fsb  xs  >>=  \ ys  -> 
          Right 
          $  toEnum  ( a  .|.  b )  :  ys 
      |  otherwise  = 
        Left  $  msg  $  "Two bytes: "  ++  ( show  $  b1 : b2 : [] ) 
    bs02  _________  = 
      Left  $  "Exhaustive pattern match." 
    bs03  ( b1 : b2 : b3 : xs ) 
      |  l3s  $  b1 : b2 : b3 : []  = 
        let 
          a  =  ( fromIntegral  $  b1  .&.  0x0F  ::  Int )  .<.  0012 
          b  =  ( fromIntegral  $  b2  .&.  0x3F  ::  Int )  .<.  0006 
          c  =   fromIntegral  $  b3  .&.  0x3F  ::  Int 
        in 
          fsb  xs  >>=  \ ys  -> 
          Right 
          $  toEnum  ( a  .|.  b  .|.  c )  :  ys 
      |  otherwise  = 
        Left  $  msg  $  "Three bytes: "  ++  ( show  $  b1 : b2 : b3 : [] ) 
    bs03  _____________  = 
      Left  $  "Exhaustive pattern match." 
    bs04  ( b1 : b2 : b3 : b4 : xs ) 
      |  l4s  $  b1 : b2 : b3 : b4 : []  = 
        let 
          a  =  ( fromIntegral  $  b1  .&.  0x07  ::  Int )  .<.  0018 
          b  =  ( fromIntegral  $  b2  .&.  0x3F  ::  Int )  .<.  0012 
          c  =  ( fromIntegral  $  b3  .&.  0x3F  ::  Int )  .<.  0006 
          d  =   fromIntegral  $  b4  .&.  0x3F  ::  Int 
        in 
          fsb  xs  >>=  \ ys  -> 
          Right 
          $  toEnum  ( a  .|.  b  .|.  c  .|.  d )  :  ys 
      |  otherwise  = 
        Left  $  msg  $  "Four bytes: "  ++  ( show  $  b1 : b2 : b3 : b4 : [] ) 
    bs04  ________________  = 
      Left  $  "Exhaustive pattern match." 
u2a  c 
  -- "word" can't be less than 0 or greater than 0x10FFFF (HS Char constraints) 
  |  word  <  0x000080  =  bs01 
  |  word  <  0x000800  =  bs02 
  |  word  <  0x010000  =  bs03 
  |  otherwise        =  bs04 
  where 
    ( .>. )  x  y  =  x  ` shiftR `  y 
    word  =  fromEnum  c 
    bs01  = 
      Right 
      [  fromIntegral  $  word 
      ] 
    bs02  = 
      Right 
      [  fromIntegral  $  word  .>.  0006           .|.  0xC0 
      ,  fromIntegral  $  word           .&.  0x3F  .|.  0x80 
      ] 
      >>=  valid  l2s  "Two bytes: " 
    bs03  = 
      Right 
      [  fromIntegral  $  word  .>.  0012           .|.  0xE0 
      ,  fromIntegral  $  word  .>.  0006  .&.  0x3F  .|.  0x80 
      ,  fromIntegral  $  word           .&.  0x3F  .|.  0x80 
      ] 
      >>=  valid  l3s  "Three bytes: " 
    bs04  = 
      Right 
      [  fromIntegral  $  word  .>.  0018           .|.  0xF0 
      ,  fromIntegral  $  word  .>.  0012  .&.  0x3F  .|.  0x80 
      ,  fromIntegral  $  word  .>.  0006  .&.  0x3F  .|.  0x80 
      ,  fromIntegral  $  word           .&.  0x3F  .|.  0x80 
      ] 
      >>=  valid  l4s  "Four bytes: " 
    valid  lfn  txt  bs  = 
      if 
        lfn  $  bs 
      then 
        Right  bs 
      else 
        Left  $  msg  $  txt  ++  ( show  $  bs ) 
tsb  [    ]  = 
  Right  [] 
tsb  ( x : xs )  = 
  u2a  x   >>=  \ ws  -> 
  tsb  xs  >>=  \ ys  -> 
  Right  $  ws  ++  ys 
msg  x  = 
  x 
  ++  " is/are not a Legal UTF-8 Byte Sequence, see Table 3.1B at: \n " 
  ++  " -- http://unicode.org/versions/corrigendum1.html" 
 
FSB.hs 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
 {-# LANGUAGE Safe #-} 
-------------------------------------------------------------------------------- 
module  Main  where 
import  qualified  Data.UTF8  as  UTF8 
-------------------------------------------------------------------------------- 
main 
  ::  IO  () 
-------------------------------------------------------------------------------- 
class  Monad  m  =>  InteractM  m  where 
  interact' 
    ::  ( String  ->  String )  ->  m  () 
-------------------------------------------------------------------------------- 
instance  InteractM  IO  where 
  interact' 
    =  interact 
-------------------------------------------------------------------------------- 
main  = 
  interact' 
  $ 
  \ xs  -> 
    case  UTF8 . fromSingleBytes  $  map  ( fromIntegral  .  fromEnum )  xs  of 
      Left  msg  -> 
        msg 
      Right  ys  -> 
        ys 
 
TSB.hs 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
 {-# LANGUAGE Safe #-} 
-------------------------------------------------------------------------------- 
module  Main  where 
import  qualified  Data.UTF8  as  UTF8 
-------------------------------------------------------------------------------- 
main 
  ::  IO  () 
-------------------------------------------------------------------------------- 
class  Monad  m  =>  InteractM  m  where 
  interact' 
    ::  ( String  ->  String )  ->  m  () 
-------------------------------------------------------------------------------- 
instance  InteractM  IO  where 
  interact' 
    =  interact 
-------------------------------------------------------------------------------- 
main  = 
  interact' 
  $ 
  \ xs  -> 
    case  UTF8 . toSingleBytes  xs  of 
      Left  msg  -> 
        msg 
      Right  ys  -> 
        map  ( toEnum  .  fromIntegral )  ys 
 
Build Snippet 
build.bash 
1
2
3
4
5
6
7
8
9
10
11
12
13
 #!/bin/bash 
clear
# tsb 
ghc -Wall  -Werror  -O3  --make  TSB.hs -o  tsb
# fsb 
ghc -Wall  -Werror  -O3  --make  FSB.hs -o  fsb
# clean 
find .  -name  '*.hi'  -delete 
find .  -name  '*.o'   -delete 
 
Output: 
user@personal:~/.../src$ echo "Haskell in Japanese (ハスケル) and Chinese (哈斯克爾)" \
    | ./tsb \
    | ./fsb 
Haskell in Japanese (ハスケル) and Chinese (哈斯克爾)
user@personal:~/.../src$  
References: