readerc.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
  1. package yaml
  2. import (
  3. "io"
  4. )
  5. // Set the reader error and return 0.
  6. func yaml_parser_set_reader_error(parser *yaml_parser_t, problem string, offset int, value int) bool {
  7. parser.error = yaml_READER_ERROR
  8. parser.problem = problem
  9. parser.problem_offset = offset
  10. parser.problem_value = value
  11. return false
  12. }
  13. // Byte order marks.
  14. const (
  15. bom_UTF8 = "\xef\xbb\xbf"
  16. bom_UTF16LE = "\xff\xfe"
  17. bom_UTF16BE = "\xfe\xff"
  18. )
  19. // Determine the input stream encoding by checking the BOM symbol. If no BOM is
  20. // found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
  21. func yaml_parser_determine_encoding(parser *yaml_parser_t) bool {
  22. // Ensure that we had enough bytes in the raw buffer.
  23. for !parser.eof && len(parser.raw_buffer)-parser.raw_buffer_pos < 3 {
  24. if !yaml_parser_update_raw_buffer(parser) {
  25. return false
  26. }
  27. }
  28. // Determine the encoding.
  29. buf := parser.raw_buffer
  30. pos := parser.raw_buffer_pos
  31. avail := len(buf) - pos
  32. if avail >= 2 && buf[pos] == bom_UTF16LE[0] && buf[pos+1] == bom_UTF16LE[1] {
  33. parser.encoding = yaml_UTF16LE_ENCODING
  34. parser.raw_buffer_pos += 2
  35. parser.offset += 2
  36. } else if avail >= 2 && buf[pos] == bom_UTF16BE[0] && buf[pos+1] == bom_UTF16BE[1] {
  37. parser.encoding = yaml_UTF16BE_ENCODING
  38. parser.raw_buffer_pos += 2
  39. parser.offset += 2
  40. } else if avail >= 3 && buf[pos] == bom_UTF8[0] && buf[pos+1] == bom_UTF8[1] && buf[pos+2] == bom_UTF8[2] {
  41. parser.encoding = yaml_UTF8_ENCODING
  42. parser.raw_buffer_pos += 3
  43. parser.offset += 3
  44. } else {
  45. parser.encoding = yaml_UTF8_ENCODING
  46. }
  47. return true
  48. }
  49. // Update the raw buffer.
  50. func yaml_parser_update_raw_buffer(parser *yaml_parser_t) bool {
  51. size_read := 0
  52. // Return if the raw buffer is full.
  53. if parser.raw_buffer_pos == 0 && len(parser.raw_buffer) == cap(parser.raw_buffer) {
  54. return true
  55. }
  56. // Return on EOF.
  57. if parser.eof {
  58. return true
  59. }
  60. // Move the remaining bytes in the raw buffer to the beginning.
  61. if parser.raw_buffer_pos > 0 && parser.raw_buffer_pos < len(parser.raw_buffer) {
  62. copy(parser.raw_buffer, parser.raw_buffer[parser.raw_buffer_pos:])
  63. }
  64. parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)-parser.raw_buffer_pos]
  65. parser.raw_buffer_pos = 0
  66. // Call the read handler to fill the buffer.
  67. size_read, err := parser.read_handler(parser, parser.raw_buffer[len(parser.raw_buffer):cap(parser.raw_buffer)])
  68. parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)+size_read]
  69. if err == io.EOF {
  70. parser.eof = true
  71. } else if err != nil {
  72. return yaml_parser_set_reader_error(parser, "input error: "+err.Error(), parser.offset, -1)
  73. }
  74. return true
  75. }
  76. // Ensure that the buffer contains at least `length` characters.
  77. // Return true on success, false on failure.
  78. //
  79. // The length is supposed to be significantly less that the buffer size.
  80. func yaml_parser_update_buffer(parser *yaml_parser_t, length int) bool {
  81. if parser.read_handler == nil {
  82. panic("read handler must be set")
  83. }
  84. // If the EOF flag is set and the raw buffer is empty, do nothing.
  85. if parser.eof && parser.raw_buffer_pos == len(parser.raw_buffer) {
  86. return true
  87. }
  88. // Return if the buffer contains enough characters.
  89. if parser.unread >= length {
  90. return true
  91. }
  92. // Determine the input encoding if it is not known yet.
  93. if parser.encoding == yaml_ANY_ENCODING {
  94. if !yaml_parser_determine_encoding(parser) {
  95. return false
  96. }
  97. }
  98. // Move the unread characters to the beginning of the buffer.
  99. buffer_len := len(parser.buffer)
  100. if parser.buffer_pos > 0 && parser.buffer_pos < buffer_len {
  101. copy(parser.buffer, parser.buffer[parser.buffer_pos:])
  102. buffer_len -= parser.buffer_pos
  103. parser.buffer_pos = 0
  104. } else if parser.buffer_pos == buffer_len {
  105. buffer_len = 0
  106. parser.buffer_pos = 0
  107. }
  108. // Open the whole buffer for writing, and cut it before returning.
  109. parser.buffer = parser.buffer[:cap(parser.buffer)]
  110. // Fill the buffer until it has enough characters.
  111. first := true
  112. for parser.unread < length {
  113. // Fill the raw buffer if necessary.
  114. if !first || parser.raw_buffer_pos == len(parser.raw_buffer) {
  115. if !yaml_parser_update_raw_buffer(parser) {
  116. parser.buffer = parser.buffer[:buffer_len]
  117. return false
  118. }
  119. }
  120. first = false
  121. // Decode the raw buffer.
  122. inner:
  123. for parser.raw_buffer_pos != len(parser.raw_buffer) {
  124. var value rune
  125. var width int
  126. raw_unread := len(parser.raw_buffer) - parser.raw_buffer_pos
  127. // Decode the next character.
  128. switch parser.encoding {
  129. case yaml_UTF8_ENCODING:
  130. // Decode a UTF-8 character. Check RFC 3629
  131. // (http://www.ietf.org/rfc/rfc3629.txt) for more details.
  132. //
  133. // The following table (taken from the RFC) is used for
  134. // decoding.
  135. //
  136. // Char. number range | UTF-8 octet sequence
  137. // (hexadecimal) | (binary)
  138. // --------------------+------------------------------------
  139. // 0000 0000-0000 007F | 0xxxxxxx
  140. // 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  141. // 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  142. // 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  143. //
  144. // Additionally, the characters in the range 0xD800-0xDFFF
  145. // are prohibited as they are reserved for use with UTF-16
  146. // surrogate pairs.
  147. // Determine the length of the UTF-8 sequence.
  148. octet := parser.raw_buffer[parser.raw_buffer_pos]
  149. switch {
  150. case octet&0x80 == 0x00:
  151. width = 1
  152. case octet&0xE0 == 0xC0:
  153. width = 2
  154. case octet&0xF0 == 0xE0:
  155. width = 3
  156. case octet&0xF8 == 0xF0:
  157. width = 4
  158. default:
  159. // The leading octet is invalid.
  160. return yaml_parser_set_reader_error(parser,
  161. "invalid leading UTF-8 octet",
  162. parser.offset, int(octet))
  163. }
  164. // Check if the raw buffer contains an incomplete character.
  165. if width > raw_unread {
  166. if parser.eof {
  167. return yaml_parser_set_reader_error(parser,
  168. "incomplete UTF-8 octet sequence",
  169. parser.offset, -1)
  170. }
  171. break inner
  172. }
  173. // Decode the leading octet.
  174. switch {
  175. case octet&0x80 == 0x00:
  176. value = rune(octet & 0x7F)
  177. case octet&0xE0 == 0xC0:
  178. value = rune(octet & 0x1F)
  179. case octet&0xF0 == 0xE0:
  180. value = rune(octet & 0x0F)
  181. case octet&0xF8 == 0xF0:
  182. value = rune(octet & 0x07)
  183. default:
  184. value = 0
  185. }
  186. // Check and decode the trailing octets.
  187. for k := 1; k < width; k++ {
  188. octet = parser.raw_buffer[parser.raw_buffer_pos+k]
  189. // Check if the octet is valid.
  190. if (octet & 0xC0) != 0x80 {
  191. return yaml_parser_set_reader_error(parser,
  192. "invalid trailing UTF-8 octet",
  193. parser.offset+k, int(octet))
  194. }
  195. // Decode the octet.
  196. value = (value << 6) + rune(octet&0x3F)
  197. }
  198. // Check the length of the sequence against the value.
  199. switch {
  200. case width == 1:
  201. case width == 2 && value >= 0x80:
  202. case width == 3 && value >= 0x800:
  203. case width == 4 && value >= 0x10000:
  204. default:
  205. return yaml_parser_set_reader_error(parser,
  206. "invalid length of a UTF-8 sequence",
  207. parser.offset, -1)
  208. }
  209. // Check the range of the value.
  210. if value >= 0xD800 && value <= 0xDFFF || value > 0x10FFFF {
  211. return yaml_parser_set_reader_error(parser,
  212. "invalid Unicode character",
  213. parser.offset, int(value))
  214. }
  215. case yaml_UTF16LE_ENCODING, yaml_UTF16BE_ENCODING:
  216. var low, high int
  217. if parser.encoding == yaml_UTF16LE_ENCODING {
  218. low, high = 0, 1
  219. } else {
  220. high, low = 1, 0
  221. }
  222. // The UTF-16 encoding is not as simple as one might
  223. // naively think. Check RFC 2781
  224. // (http://www.ietf.org/rfc/rfc2781.txt).
  225. //
  226. // Normally, two subsequent bytes describe a Unicode
  227. // character. However a special technique (called a
  228. // surrogate pair) is used for specifying character
  229. // values larger than 0xFFFF.
  230. //
  231. // A surrogate pair consists of two pseudo-characters:
  232. // high surrogate area (0xD800-0xDBFF)
  233. // low surrogate area (0xDC00-0xDFFF)
  234. //
  235. // The following formulas are used for decoding
  236. // and encoding characters using surrogate pairs:
  237. //
  238. // U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF)
  239. // U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF)
  240. // W1 = 110110yyyyyyyyyy
  241. // W2 = 110111xxxxxxxxxx
  242. //
  243. // where U is the character value, W1 is the high surrogate
  244. // area, W2 is the low surrogate area.
  245. // Check for incomplete UTF-16 character.
  246. if raw_unread < 2 {
  247. if parser.eof {
  248. return yaml_parser_set_reader_error(parser,
  249. "incomplete UTF-16 character",
  250. parser.offset, -1)
  251. }
  252. break inner
  253. }
  254. // Get the character.
  255. value = rune(parser.raw_buffer[parser.raw_buffer_pos+low]) +
  256. (rune(parser.raw_buffer[parser.raw_buffer_pos+high]) << 8)
  257. // Check for unexpected low surrogate area.
  258. if value&0xFC00 == 0xDC00 {
  259. return yaml_parser_set_reader_error(parser,
  260. "unexpected low surrogate area",
  261. parser.offset, int(value))
  262. }
  263. // Check for a high surrogate area.
  264. if value&0xFC00 == 0xD800 {
  265. width = 4
  266. // Check for incomplete surrogate pair.
  267. if raw_unread < 4 {
  268. if parser.eof {
  269. return yaml_parser_set_reader_error(parser,
  270. "incomplete UTF-16 surrogate pair",
  271. parser.offset, -1)
  272. }
  273. break inner
  274. }
  275. // Get the next character.
  276. value2 := rune(parser.raw_buffer[parser.raw_buffer_pos+low+2]) +
  277. (rune(parser.raw_buffer[parser.raw_buffer_pos+high+2]) << 8)
  278. // Check for a low surrogate area.
  279. if value2&0xFC00 != 0xDC00 {
  280. return yaml_parser_set_reader_error(parser,
  281. "expected low surrogate area",
  282. parser.offset+2, int(value2))
  283. }
  284. // Generate the value of the surrogate pair.
  285. value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF)
  286. } else {
  287. width = 2
  288. }
  289. default:
  290. panic("impossible")
  291. }
  292. // Check if the character is in the allowed range:
  293. // #x9 | #xA | #xD | [#x20-#x7E] (8 bit)
  294. // | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)
  295. // | [#x10000-#x10FFFF] (32 bit)
  296. switch {
  297. case value == 0x09:
  298. case value == 0x0A:
  299. case value == 0x0D:
  300. case value >= 0x20 && value <= 0x7E:
  301. case value == 0x85:
  302. case value >= 0xA0 && value <= 0xD7FF:
  303. case value >= 0xE000 && value <= 0xFFFD:
  304. case value >= 0x10000 && value <= 0x10FFFF:
  305. default:
  306. return yaml_parser_set_reader_error(parser,
  307. "control characters are not allowed",
  308. parser.offset, int(value))
  309. }
  310. // Move the raw pointers.
  311. parser.raw_buffer_pos += width
  312. parser.offset += width
  313. // Finally put the character into the buffer.
  314. if value <= 0x7F {
  315. // 0000 0000-0000 007F . 0xxxxxxx
  316. parser.buffer[buffer_len+0] = byte(value)
  317. } else if value <= 0x7FF {
  318. // 0000 0080-0000 07FF . 110xxxxx 10xxxxxx
  319. parser.buffer[buffer_len+0] = byte(0xC0 + (value >> 6))
  320. parser.buffer[buffer_len+1] = byte(0x80 + (value & 0x3F))
  321. } else if value <= 0xFFFF {
  322. // 0000 0800-0000 FFFF . 1110xxxx 10xxxxxx 10xxxxxx
  323. parser.buffer[buffer_len+0] = byte(0xE0 + (value >> 12))
  324. parser.buffer[buffer_len+1] = byte(0x80 + ((value >> 6) & 0x3F))
  325. parser.buffer[buffer_len+2] = byte(0x80 + (value & 0x3F))
  326. } else {
  327. // 0001 0000-0010 FFFF . 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  328. parser.buffer[buffer_len+0] = byte(0xF0 + (value >> 18))
  329. parser.buffer[buffer_len+1] = byte(0x80 + ((value >> 12) & 0x3F))
  330. parser.buffer[buffer_len+2] = byte(0x80 + ((value >> 6) & 0x3F))
  331. parser.buffer[buffer_len+3] = byte(0x80 + (value & 0x3F))
  332. }
  333. buffer_len += width
  334. parser.unread++
  335. }
  336. // On EOF, put NUL into the buffer and return.
  337. if parser.eof {
  338. parser.buffer[buffer_len] = 0
  339. buffer_len++
  340. parser.unread++
  341. break
  342. }
  343. }
  344. parser.buffer = parser.buffer[:buffer_len]
  345. return true
  346. }