escaping.cc (7585B)
1 // Copyright 2020 The Abseil Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "absl/strings/internal/escaping.h" 16 17 #include <limits> 18 19 #include "absl/base/internal/endian.h" 20 #include "absl/base/internal/raw_logging.h" 21 22 namespace absl { 23 ABSL_NAMESPACE_BEGIN 24 namespace strings_internal { 25 26 // The two strings below provide maps from normal 6-bit characters to their 27 // base64-escaped equivalent. 28 // For the inverse case, see kUn(WebSafe)Base64 in the external 29 // escaping.cc. 30 ABSL_CONST_INIT const char kBase64Chars[] = 31 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 32 33 ABSL_CONST_INIT const char kWebSafeBase64Chars[] = 34 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; 35 36 size_t CalculateBase64EscapedLenInternal(size_t input_len, bool do_padding) { 37 // Base64 encodes three bytes of input at a time. If the input is not 38 // divisible by three, we pad as appropriate. 39 // 40 // Base64 encodes each three bytes of input into four bytes of output. 41 constexpr size_t kMaxSize = (std::numeric_limits<size_t>::max() - 1) / 4 * 3; 42 ABSL_INTERNAL_CHECK(input_len <= kMaxSize, 43 "CalculateBase64EscapedLenInternal() overflow"); 44 size_t len = (input_len / 3) * 4; 45 46 // Since all base 64 input is an integral number of octets, only the following 47 // cases can arise: 48 if (input_len % 3 == 0) { 49 // (from https://tools.ietf.org/html/rfc3548) 50 // (1) the final quantum of encoding input is an integral multiple of 24 51 // bits; here, the final unit of encoded output will be an integral 52 // multiple of 4 characters with no "=" padding, 53 } else if (input_len % 3 == 1) { 54 // (from https://tools.ietf.org/html/rfc3548) 55 // (2) the final quantum of encoding input is exactly 8 bits; here, the 56 // final unit of encoded output will be two characters followed by two 57 // "=" padding characters, or 58 len += 2; 59 if (do_padding) { 60 len += 2; 61 } 62 } else { // (input_len % 3 == 2) 63 // (from https://tools.ietf.org/html/rfc3548) 64 // (3) the final quantum of encoding input is exactly 16 bits; here, the 65 // final unit of encoded output will be three characters followed by one 66 // "=" padding character. 67 len += 3; 68 if (do_padding) { 69 len += 1; 70 } 71 } 72 73 return len; 74 } 75 76 // ---------------------------------------------------------------------- 77 // Take the input in groups of 4 characters and turn each 78 // character into a code 0 to 63 thus: 79 // A-Z map to 0 to 25 80 // a-z map to 26 to 51 81 // 0-9 map to 52 to 61 82 // +(- for WebSafe) maps to 62 83 // /(_ for WebSafe) maps to 63 84 // There will be four numbers, all less than 64 which can be represented 85 // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively). 86 // Arrange the 6 digit binary numbers into three bytes as such: 87 // aaaaaabb bbbbcccc ccdddddd 88 // Equals signs (one or two) are used at the end of the encoded block to 89 // indicate that the text was not an integer multiple of three bytes long. 90 // ---------------------------------------------------------------------- 91 size_t Base64EscapeInternal(const unsigned char* src, size_t szsrc, char* dest, 92 size_t szdest, const char* base64, 93 bool do_padding) { 94 static const char kPad64 = '='; 95 96 if (szsrc * 4 > szdest * 3) return 0; 97 98 char* cur_dest = dest; 99 const unsigned char* cur_src = src; 100 101 char* const limit_dest = dest + szdest; 102 const unsigned char* const limit_src = src + szsrc; 103 104 // (from https://tools.ietf.org/html/rfc3548) 105 // Special processing is performed if fewer than 24 bits are available 106 // at the end of the data being encoded. A full encoding quantum is 107 // always completed at the end of a quantity. When fewer than 24 input 108 // bits are available in an input group, zero bits are added (on the 109 // right) to form an integral number of 6-bit groups. 110 // 111 // If do_padding is true, padding at the end of the data is performed. This 112 // output padding uses the '=' character. 113 114 // Three bytes of data encodes to four characters of cyphertext. 115 // So we can pump through three-byte chunks atomically. 116 if (szsrc >= 3) { // "limit_src - 3" is UB if szsrc < 3. 117 while (cur_src < limit_src - 3) { // While we have >= 32 bits. 118 uint32_t in = absl::big_endian::Load32(cur_src) >> 8; 119 120 cur_dest[0] = base64[in >> 18]; 121 in &= 0x3FFFF; 122 cur_dest[1] = base64[in >> 12]; 123 in &= 0xFFF; 124 cur_dest[2] = base64[in >> 6]; 125 in &= 0x3F; 126 cur_dest[3] = base64[in]; 127 128 cur_dest += 4; 129 cur_src += 3; 130 } 131 } 132 // To save time, we didn't update szdest or szsrc in the loop. So do it now. 133 szdest = static_cast<size_t>(limit_dest - cur_dest); 134 szsrc = static_cast<size_t>(limit_src - cur_src); 135 136 /* now deal with the tail (<=3 bytes) */ 137 switch (szsrc) { 138 case 0: 139 // Nothing left; nothing more to do. 140 break; 141 case 1: { 142 // One byte left: this encodes to two characters, and (optionally) 143 // two pad characters to round out the four-character cypherblock. 144 if (szdest < 2) return 0; 145 uint32_t in = cur_src[0]; 146 cur_dest[0] = base64[in >> 2]; 147 in &= 0x3; 148 cur_dest[1] = base64[in << 4]; 149 cur_dest += 2; 150 szdest -= 2; 151 if (do_padding) { 152 if (szdest < 2) return 0; 153 cur_dest[0] = kPad64; 154 cur_dest[1] = kPad64; 155 cur_dest += 2; 156 szdest -= 2; 157 } 158 break; 159 } 160 case 2: { 161 // Two bytes left: this encodes to three characters, and (optionally) 162 // one pad character to round out the four-character cypherblock. 163 if (szdest < 3) return 0; 164 uint32_t in = absl::big_endian::Load16(cur_src); 165 cur_dest[0] = base64[in >> 10]; 166 in &= 0x3FF; 167 cur_dest[1] = base64[in >> 4]; 168 in &= 0x00F; 169 cur_dest[2] = base64[in << 2]; 170 cur_dest += 3; 171 szdest -= 3; 172 if (do_padding) { 173 if (szdest < 1) return 0; 174 cur_dest[0] = kPad64; 175 cur_dest += 1; 176 szdest -= 1; 177 } 178 break; 179 } 180 case 3: { 181 // Three bytes left: same as in the big loop above. We can't do this in 182 // the loop because the loop above always reads 4 bytes, and the fourth 183 // byte is past the end of the input. 184 if (szdest < 4) return 0; 185 uint32_t in = 186 (uint32_t{cur_src[0]} << 16) + absl::big_endian::Load16(cur_src + 1); 187 cur_dest[0] = base64[in >> 18]; 188 in &= 0x3FFFF; 189 cur_dest[1] = base64[in >> 12]; 190 in &= 0xFFF; 191 cur_dest[2] = base64[in >> 6]; 192 in &= 0x3F; 193 cur_dest[3] = base64[in]; 194 cur_dest += 4; 195 szdest -= 4; 196 break; 197 } 198 default: 199 // Should not be reached: blocks of 4 bytes are handled 200 // in the while loop before this switch statement. 201 ABSL_RAW_LOG(FATAL, "Logic problem? szsrc = %zu", szsrc); 202 break; 203 } 204 return static_cast<size_t>(cur_dest - dest); 205 } 206 207 } // namespace strings_internal 208 ABSL_NAMESPACE_END 209 } // namespace absl