reader.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. /*
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. */
  12. package candiedyaml
  13. import (
  14. "io"
  15. )
  16. /*
  17. * Set the reader error and return 0.
  18. */
  19. func yaml_parser_set_reader_error(parser *yaml_parser_t, problem string,
  20. offset int, value int) bool {
  21. parser.error = yaml_READER_ERROR
  22. parser.problem = problem
  23. parser.problem_offset = offset
  24. parser.problem_value = value
  25. return false
  26. }
  27. /*
  28. * Byte order marks.
  29. */
  30. const (
  31. BOM_UTF8 = "\xef\xbb\xbf"
  32. BOM_UTF16LE = "\xff\xfe"
  33. BOM_UTF16BE = "\xfe\xff"
  34. )
  35. /*
  36. * Determine the input stream encoding by checking the BOM symbol. If no BOM is
  37. * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
  38. */
  39. func yaml_parser_determine_encoding(parser *yaml_parser_t) bool {
  40. /* Ensure that we had enough bytes in the raw buffer. */
  41. for !parser.eof &&
  42. len(parser.raw_buffer)-parser.raw_buffer_pos < 3 {
  43. if !yaml_parser_update_raw_buffer(parser) {
  44. return false
  45. }
  46. }
  47. /* Determine the encoding. */
  48. raw := parser.raw_buffer
  49. pos := parser.raw_buffer_pos
  50. remaining := len(raw) - pos
  51. if remaining >= 2 &&
  52. raw[pos] == BOM_UTF16LE[0] && raw[pos+1] == BOM_UTF16LE[1] {
  53. parser.encoding = yaml_UTF16LE_ENCODING
  54. parser.raw_buffer_pos += 2
  55. parser.offset += 2
  56. } else if remaining >= 2 &&
  57. raw[pos] == BOM_UTF16BE[0] && raw[pos+1] == BOM_UTF16BE[1] {
  58. parser.encoding = yaml_UTF16BE_ENCODING
  59. parser.raw_buffer_pos += 2
  60. parser.offset += 2
  61. } else if remaining >= 3 &&
  62. raw[pos] == BOM_UTF8[0] && raw[pos+1] == BOM_UTF8[1] && raw[pos+2] == BOM_UTF8[2] {
  63. parser.encoding = yaml_UTF8_ENCODING
  64. parser.raw_buffer_pos += 3
  65. parser.offset += 3
  66. } else {
  67. parser.encoding = yaml_UTF8_ENCODING
  68. }
  69. return true
  70. }
  71. /*
  72. * Update the raw buffer.
  73. */
  74. func yaml_parser_update_raw_buffer(parser *yaml_parser_t) bool {
  75. size_read := 0
  76. /* Return if the raw buffer is full. */
  77. if parser.raw_buffer_pos == 0 && len(parser.raw_buffer) == cap(parser.raw_buffer) {
  78. return true
  79. }
  80. /* Return on EOF. */
  81. if parser.eof {
  82. return true
  83. }
  84. /* Move the remaining bytes in the raw buffer to the beginning. */
  85. if parser.raw_buffer_pos > 0 && parser.raw_buffer_pos < len(parser.raw_buffer) {
  86. copy(parser.raw_buffer, parser.raw_buffer[parser.raw_buffer_pos:])
  87. }
  88. parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)-parser.raw_buffer_pos]
  89. parser.raw_buffer_pos = 0
  90. /* Call the read handler to fill the buffer. */
  91. size_read, err := parser.read_handler(parser,
  92. parser.raw_buffer[len(parser.raw_buffer):cap(parser.raw_buffer)])
  93. parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)+size_read]
  94. if err == io.EOF {
  95. parser.eof = true
  96. } else if err != nil {
  97. return yaml_parser_set_reader_error(parser, "input error: "+err.Error(),
  98. parser.offset, -1)
  99. }
  100. return true
  101. }
  102. /*
  103. * Ensure that the buffer contains at least `length` characters.
  104. * Return 1 on success, 0 on failure.
  105. *
  106. * The length is supposed to be significantly less that the buffer size.
  107. */
  108. func yaml_parser_update_buffer(parser *yaml_parser_t, length int) bool {
  109. /* Read handler must be set. */
  110. if parser.read_handler == nil {
  111. panic("read handler must be set")
  112. }
  113. /* If the EOF flag is set and the raw buffer is empty, do nothing. */
  114. if parser.eof && parser.raw_buffer_pos == len(parser.raw_buffer) {
  115. return true
  116. }
  117. /* Return if the buffer contains enough characters. */
  118. if parser.unread >= length {
  119. return true
  120. }
  121. /* Determine the input encoding if it is not known yet. */
  122. if parser.encoding == yaml_ANY_ENCODING {
  123. if !yaml_parser_determine_encoding(parser) {
  124. return false
  125. }
  126. }
  127. /* Move the unread characters to the beginning of the buffer. */
  128. buffer_end := len(parser.buffer)
  129. if 0 < parser.buffer_pos &&
  130. parser.buffer_pos < buffer_end {
  131. copy(parser.buffer, parser.buffer[parser.buffer_pos:])
  132. buffer_end -= parser.buffer_pos
  133. parser.buffer_pos = 0
  134. } else if parser.buffer_pos == buffer_end {
  135. buffer_end = 0
  136. parser.buffer_pos = 0
  137. }
  138. parser.buffer = parser.buffer[:cap(parser.buffer)]
  139. /* Fill the buffer until it has enough characters. */
  140. first := true
  141. for parser.unread < length {
  142. /* Fill the raw buffer if necessary. */
  143. if !first || parser.raw_buffer_pos == len(parser.raw_buffer) {
  144. if !yaml_parser_update_raw_buffer(parser) {
  145. parser.buffer = parser.buffer[:buffer_end]
  146. return false
  147. }
  148. }
  149. first = false
  150. /* Decode the raw buffer. */
  151. for parser.raw_buffer_pos != len(parser.raw_buffer) {
  152. var value rune
  153. var w int
  154. raw_unread := len(parser.raw_buffer) - parser.raw_buffer_pos
  155. incomplete := false
  156. /* Decode the next character. */
  157. switch parser.encoding {
  158. case yaml_UTF8_ENCODING:
  159. /*
  160. * Decode a UTF-8 character. Check RFC 3629
  161. * (http://www.ietf.org/rfc/rfc3629.txt) for more details.
  162. *
  163. * The following table (taken from the RFC) is used for
  164. * decoding.
  165. *
  166. * Char. number range | UTF-8 octet sequence
  167. * (hexadecimal) | (binary)
  168. * --------------------+------------------------------------
  169. * 0000 0000-0000 007F | 0xxxxxxx
  170. * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  171. * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  172. * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  173. *
  174. * Additionally, the characters in the range 0xD800-0xDFFF
  175. * are prohibited as they are reserved for use with UTF-16
  176. * surrogate pairs.
  177. */
  178. /* Determine the length of the UTF-8 sequence. */
  179. octet := parser.raw_buffer[parser.raw_buffer_pos]
  180. w = width(octet)
  181. /* Check if the leading octet is valid. */
  182. if w == 0 {
  183. return yaml_parser_set_reader_error(parser,
  184. "invalid leading UTF-8 octet",
  185. parser.offset, int(octet))
  186. }
  187. /* Check if the raw buffer contains an incomplete character. */
  188. if w > raw_unread {
  189. if parser.eof {
  190. return yaml_parser_set_reader_error(parser,
  191. "incomplete UTF-8 octet sequence",
  192. parser.offset, -1)
  193. }
  194. incomplete = true
  195. break
  196. }
  197. /* Decode the leading octet. */
  198. switch {
  199. case octet&0x80 == 0x00:
  200. value = rune(octet & 0x7F)
  201. case octet&0xE0 == 0xC0:
  202. value = rune(octet & 0x1F)
  203. case octet&0xF0 == 0xE0:
  204. value = rune(octet & 0x0F)
  205. case octet&0xF8 == 0xF0:
  206. value = rune(octet & 0x07)
  207. default:
  208. value = 0
  209. }
  210. /* Check and decode the trailing octets. */
  211. for k := 1; k < w; k++ {
  212. octet = parser.raw_buffer[parser.raw_buffer_pos+k]
  213. /* Check if the octet is valid. */
  214. if (octet & 0xC0) != 0x80 {
  215. return yaml_parser_set_reader_error(parser,
  216. "invalid trailing UTF-8 octet",
  217. parser.offset+k, int(octet))
  218. }
  219. /* Decode the octet. */
  220. value = (value << 6) + rune(octet&0x3F)
  221. }
  222. /* Check the length of the sequence against the value. */
  223. switch {
  224. case w == 1:
  225. case w == 2 && value >= 0x80:
  226. case w == 3 && value >= 0x800:
  227. case w == 4 && value >= 0x10000:
  228. default:
  229. return yaml_parser_set_reader_error(parser,
  230. "invalid length of a UTF-8 sequence",
  231. parser.offset, -1)
  232. }
  233. /* Check the range of the value. */
  234. if (value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF {
  235. return yaml_parser_set_reader_error(parser,
  236. "invalid Unicode character",
  237. parser.offset, int(value))
  238. }
  239. case yaml_UTF16LE_ENCODING,
  240. yaml_UTF16BE_ENCODING:
  241. var low, high int
  242. if parser.encoding == yaml_UTF16LE_ENCODING {
  243. low, high = 0, 1
  244. } else {
  245. high, low = 1, 0
  246. }
  247. /*
  248. * The UTF-16 encoding is not as simple as one might
  249. * naively think. Check RFC 2781
  250. * (http://www.ietf.org/rfc/rfc2781.txt).
  251. *
  252. * Normally, two subsequent bytes describe a Unicode
  253. * character. However a special technique (called a
  254. * surrogate pair) is used for specifying character
  255. * values larger than 0xFFFF.
  256. *
  257. * A surrogate pair consists of two pseudo-characters:
  258. * high surrogate area (0xD800-0xDBFF)
  259. * low surrogate area (0xDC00-0xDFFF)
  260. *
  261. * The following formulas are used for decoding
  262. * and encoding characters using surrogate pairs:
  263. *
  264. * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF)
  265. * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF)
  266. * W1 = 110110yyyyyyyyyy
  267. * W2 = 110111xxxxxxxxxx
  268. *
  269. * where U is the character value, W1 is the high surrogate
  270. * area, W2 is the low surrogate area.
  271. */
  272. /* Check for incomplete UTF-16 character. */
  273. if raw_unread < 2 {
  274. if parser.eof {
  275. return yaml_parser_set_reader_error(parser,
  276. "incomplete UTF-16 character",
  277. parser.offset, -1)
  278. }
  279. incomplete = true
  280. break
  281. }
  282. /* Get the character. */
  283. value = rune(parser.raw_buffer[parser.raw_buffer_pos+low]) +
  284. (rune(parser.raw_buffer[parser.raw_buffer_pos+high]) << 8)
  285. /* Check for unexpected low surrogate area. */
  286. if (value & 0xFC00) == 0xDC00 {
  287. return yaml_parser_set_reader_error(parser,
  288. "unexpected low surrogate area",
  289. parser.offset, int(value))
  290. }
  291. /* Check for a high surrogate area. */
  292. if (value & 0xFC00) == 0xD800 {
  293. w = 4
  294. /* Check for incomplete surrogate pair. */
  295. if raw_unread < 4 {
  296. if parser.eof {
  297. return yaml_parser_set_reader_error(parser,
  298. "incomplete UTF-16 surrogate pair",
  299. parser.offset, -1)
  300. }
  301. incomplete = true
  302. break
  303. }
  304. /* Get the next character. */
  305. value2 := rune(parser.raw_buffer[parser.raw_buffer_pos+low+2]) +
  306. (rune(parser.raw_buffer[parser.raw_buffer_pos+high+2]) << 8)
  307. /* Check for a low surrogate area. */
  308. if (value2 & 0xFC00) != 0xDC00 {
  309. return yaml_parser_set_reader_error(parser,
  310. "expected low surrogate area",
  311. parser.offset+2, int(value2))
  312. }
  313. /* Generate the value of the surrogate pair. */
  314. value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF)
  315. } else {
  316. w = 2
  317. }
  318. break
  319. default:
  320. panic("Impossible") /* Impossible. */
  321. }
  322. /* Check if the raw buffer contains enough bytes to form a character. */
  323. if incomplete {
  324. break
  325. }
  326. /*
  327. * Check if the character is in the allowed range:
  328. * #x9 | #xA | #xD | [#x20-#x7E] (8 bit)
  329. * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)
  330. * | [#x10000-#x10FFFF] (32 bit)
  331. */
  332. if !(value == 0x09 || value == 0x0A || value == 0x0D ||
  333. (value >= 0x20 && value <= 0x7E) ||
  334. (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) ||
  335. (value >= 0xE000 && value <= 0xFFFD) ||
  336. (value >= 0x10000 && value <= 0x10FFFF)) {
  337. return yaml_parser_set_reader_error(parser,
  338. "control characters are not allowed",
  339. parser.offset, int(value))
  340. }
  341. /* Move the raw pointers. */
  342. parser.raw_buffer_pos += w
  343. parser.offset += w
  344. /* Finally put the character into the buffer. */
  345. /* 0000 0000-0000 007F . 0xxxxxxx */
  346. if value <= 0x7F {
  347. parser.buffer[buffer_end] = byte(value)
  348. } else if value <= 0x7FF {
  349. /* 0000 0080-0000 07FF . 110xxxxx 10xxxxxx */
  350. parser.buffer[buffer_end] = byte(0xC0 + (value >> 6))
  351. parser.buffer[buffer_end+1] = byte(0x80 + (value & 0x3F))
  352. } else if value <= 0xFFFF {
  353. /* 0000 0800-0000 FFFF . 1110xxxx 10xxxxxx 10xxxxxx */
  354. parser.buffer[buffer_end] = byte(0xE0 + (value >> 12))
  355. parser.buffer[buffer_end+1] = byte(0x80 + ((value >> 6) & 0x3F))
  356. parser.buffer[buffer_end+2] = byte(0x80 + (value & 0x3F))
  357. } else {
  358. /* 0001 0000-0010 FFFF . 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  359. parser.buffer[buffer_end] = byte(0xF0 + (value >> 18))
  360. parser.buffer[buffer_end+1] = byte(0x80 + ((value >> 12) & 0x3F))
  361. parser.buffer[buffer_end+2] = byte(0x80 + ((value >> 6) & 0x3F))
  362. parser.buffer[buffer_end+3] = byte(0x80 + (value & 0x3F))
  363. }
  364. buffer_end += w
  365. parser.unread++
  366. }
  367. /* On EOF, put NUL into the buffer and return. */
  368. if parser.eof {
  369. parser.buffer[buffer_end] = 0
  370. buffer_end++
  371. parser.buffer = parser.buffer[:buffer_end]
  372. parser.unread++
  373. return true
  374. }
  375. }
  376. parser.buffer = parser.buffer[:buffer_end]
  377. return true
  378. }