[ neovim ].git.dasho

commit fa24df32641c850283ec7f8a35e8ad505d3866e1
parent 8a0cbf04d6a60f91c69c707789b18986a6921f8f
Author: Kevin Goodsell <58675+KevinGoodsell@users.noreply.github.com>
Date:   Thu, 12 Feb 2026 14:38:11 -0800

fix(bufwrite.c): handle invalid byte sequences #37363

Problem:

bw_rest was used as an extra buffer to save incomplete byte sequences
between calls to buf_write_bytes. Besides being unnecessarily
complicated, this introduced a number of issues:

1) The bytes stored in bw_rest could still be there at the end of
writing the file, never having been written, thus losing some of the
file content on write.

2) bw_rest was not cleared out after the "checking_conversion" phase,
leaving them to affect the written file content during the writing
phase, corrupting the file.

3) bw_rest could contain extra bytes that need to be written to the
output buffer during a buf_write_convert call, potentially before any
bytes are consumed. But some conversions are in-place, without a
separate output buffer. Writing bytes from bw_rest to the "output"
buffer actually overwrote bytes from the input buffer before they were
read, corrupting the data to be written.

4) The extra bytes in bw_rest that need to be written to the conversion
output buffer were not originally accounted for in the size calculation
for the output buffer, causing a buffer overflow (previously fixed in
Vim patch 9.1.2028).

Solution:

Rather than maintaining a separate buffer, the unconverted bytes at the
end of the buffer can just be shifted to the beginning of the buffer,
and the buffer size updated. This requires a bit of refactoring, and
buf_write_convert and buf_write_convert_with_iconv need to report the
number of bytes they consumed so that buf_write_bytes can handle the
remaining bytes.

Following conversion, bw_buf can be checked for any remaining bytes.
Leftover bytes in this case result in a conversion error, which is
better than silently dropping them.

A short section of dead code was removed from buf_write_convert, for
converting a non-UTF-8 buffer to UTF-8. Neovim buffers are always UTF-8.

A few additional tests for iconv conversions have been added. Vim's
iconv tests are disabled in Neovim because they use unsupported values
for 'encoding'.
Diffstat:
M src/nvim/bufwrite.c  | 194 +++++++++++++++++++++++++++++++------------------------------------------------
M test/functional/ex_cmds/write_spec.lua  | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

2 files changed, 169 insertions(+), 119 deletions(-)
diff --git a/src/nvim/bufwrite.c b/src/nvim/bufwrite.c
@@ -80,8 +80,6 @@ struct bw_info {
   char *bw_buf;                   // buffer with data to be written
   int bw_len;                     // length of data
   int bw_flags;                   // FIO_ flags
-  uint8_t bw_rest[CONV_RESTLEN];  // not converted bytes
-  int bw_restlen;                 // nr of bytes in bw_rest[]
   int bw_first;                   // first write call
   char *bw_conv_buf;              // buffer for writing converted chars
   size_t bw_conv_buflen;          // size of bw_conv_buf
@@ -160,30 +158,17 @@ static bool ucs2bytes(unsigned c, char **pp, int flags)
   return error;
 }
 
+/// Converts a buffer encoding based on values in ip using iconv.
+///
+/// See buf_write_convert for parameters and return value details.
 static int buf_write_convert_with_iconv(struct bw_info *ip, char **bufp, int *lenp)
 {
-  const char *from;
-  size_t fromlen;
-  size_t tolen;
-
   int len = *lenp;
 
   // Convert with iconv().
-  if (ip->bw_restlen > 0) {
-    // Need to concatenate the remainder of the previous call and
-    // the bytes of the current call.  Use the end of the
-    // conversion buffer for this.
-    fromlen = (size_t)len + (size_t)ip->bw_restlen;
-    char *fp = ip->bw_conv_buf + ip->bw_conv_buflen - fromlen;
-    memmove(fp, ip->bw_rest, (size_t)ip->bw_restlen);
-    memmove(fp + ip->bw_restlen, *bufp, (size_t)len);
-    from = fp;
-    tolen = ip->bw_conv_buflen - fromlen;
-  } else {
-    from = *bufp;
-    fromlen = (size_t)len;
-    tolen = ip->bw_conv_buflen;
-  }
+  const char *from = *bufp;
+  size_t fromlen = (size_t)len;
+  size_t tolen = ip->bw_conv_buflen;
   char *to = ip->bw_conv_buf;
 
   if (ip->bw_first) {
@@ -201,91 +186,50 @@ static int buf_write_convert_with_iconv(struct bw_info *ip, char **bufp, int *le
     ip->bw_first = false;
   }
 
-  // If iconv() has an error or there is not enough room, fail.
-  if ((iconv(ip->bw_iconv_fd, (void *)&from, &fromlen, &to, &tolen)
-       == (size_t)-1 && ICONV_ERRNO != ICONV_EINVAL)
-      || fromlen > CONV_RESTLEN) {
+  // If iconv() has an error, fail.
+  if (iconv(ip->bw_iconv_fd, (void *)&from, &fromlen, &to, &tolen)
+      == (size_t)-1 && ICONV_ERRNO != ICONV_EINVAL) {
     ip->bw_conv_error = true;
-    return FAIL;
+    return -1;
   }
 
-  // copy remainder to ip->bw_rest[] to be used for the next call.
-  if (fromlen > 0) {
-    memmove(ip->bw_rest, (void *)from, fromlen);
-  }
-  ip->bw_restlen = (int)fromlen;
-
   *bufp = ip->bw_conv_buf;
   *lenp = (int)(to - ip->bw_conv_buf);
 
-  return OK;
+  return len - (int)fromlen;
 }
 
+/// Converts a buffer encoding based on values in ip.
+///
+/// @param ip buf_write_bytes context
+/// @param[in,out] bufp Pointer to the input buffer. On return, pointer to the output buffer.
+/// @param[in,out] lenp Pointer to the input buffer length. On return the referenced value is set to
+///                     the length of the output buffer.
+///
+/// @return The number of input buffer bytes consumed for the conversion, which may be less than the
+///         initial input buffer size when the buffer ends with an incomplete character sequence.
 static int buf_write_convert(struct bw_info *ip, char **bufp, int *lenp)
 {
   int flags = ip->bw_flags;  // extra flags
 
-  if (flags & FIO_UTF8) {
-    // Convert latin1 in the buffer to UTF-8 in the file.
-    char *p = ip->bw_conv_buf;              // translate to buffer
-    for (int wlen = 0; wlen < *lenp; wlen++) {
-      p += utf_char2bytes((uint8_t)(*bufp)[wlen], p);
-    }
-    *bufp = ip->bw_conv_buf;
-    *lenp = (int)(p - ip->bw_conv_buf);
-  } else if (flags & (FIO_UCS4 | FIO_UTF16 | FIO_UCS2 | FIO_LATIN1)) {
+  int wlen = *lenp;
+  if (flags & (FIO_UCS4 | FIO_UTF16 | FIO_UCS2 | FIO_LATIN1)) {
     unsigned c;
     int n = 0;
     // Convert UTF-8 bytes in the buffer to UCS-2, UCS-4, UTF-16 or
     // Latin1 chars in the file.
     // translate in-place (can only get shorter) or to buffer
     char *p = flags & FIO_LATIN1 ? *bufp : ip->bw_conv_buf;
-    for (int wlen = 0; wlen < *lenp; wlen += n) {
-      if (wlen == 0 && ip->bw_restlen != 0) {
-        // Use remainder of previous call.  Append the start of
-        // buf[] to get a full sequence.  Might still be too
-        // short!
-        int l = MIN(*lenp, CONV_RESTLEN - ip->bw_restlen);
-        memmove(ip->bw_rest + ip->bw_restlen, *bufp, (size_t)l);
-        n = utf_ptr2len_len((char *)ip->bw_rest, ip->bw_restlen + l);
-        if (n > ip->bw_restlen + *lenp) {
-          // We have an incomplete byte sequence at the end to
-          // be written.  We can't convert it without the
-          // remaining bytes.  Keep them for the next call.
-          if (ip->bw_restlen + *lenp > CONV_RESTLEN) {
-            return FAIL;
-          }
-          ip->bw_restlen += *lenp;
-          break;
-        }
-        c = (n > 1) ? (unsigned)utf_ptr2char((char *)ip->bw_rest)
-                    : ip->bw_rest[0];
-        if (n >= ip->bw_restlen) {
-          n -= ip->bw_restlen;
-          ip->bw_restlen = 0;
-        } else {
-          ip->bw_restlen -= n;
-          memmove(ip->bw_rest, ip->bw_rest + n,
-                  (size_t)ip->bw_restlen);
-          n = 0;
-        }
-      } else {
-        n = utf_ptr2len_len(*bufp + wlen, *lenp - wlen);
-        if (n > *lenp - wlen) {
-          // We have an incomplete byte sequence at the end to
-          // be written.  We can't convert it without the
-          // remaining bytes.  Keep them for the next call.
-          if (*lenp - wlen > CONV_RESTLEN) {
-            return FAIL;
-          }
-          ip->bw_restlen = *lenp - wlen;
-          memmove(ip->bw_rest, *bufp + wlen,
-                  (size_t)ip->bw_restlen);
-          break;
-        }
-        c = n > 1 ? (unsigned)utf_ptr2char(*bufp + wlen)
-                  : (uint8_t)(*bufp)[wlen];
+    for (wlen = 0; wlen < *lenp; wlen += n) {
+      n = utf_ptr2len_len(*bufp + wlen, *lenp - wlen);
+      if (n > *lenp - wlen) {
+        // We have an incomplete byte sequence at the end to
+        // be written.  We can't convert it without the
+        // remaining bytes.  Keep them for the next call.
+        break;
       }
+      c = n > 1 ? (unsigned)utf_ptr2char(*bufp + wlen)
+                : (uint8_t)(*bufp)[wlen];
       // Check that there is enough space
       if (!(flags & FIO_LATIN1)) {
         size_t need = (flags & FIO_UCS4) ? 4 : 2;
@@ -315,12 +259,10 @@ static int buf_write_convert(struct bw_info *ip, char **bufp, int *lenp)
   }
 
   if (ip->bw_iconv_fd != (iconv_t)-1) {
-    if (buf_write_convert_with_iconv(ip, bufp, lenp) == FAIL) {
-      return FAIL;
-    }
+    return buf_write_convert_with_iconv(ip, bufp, lenp);
   }
 
-  return OK;
+  return wlen;
 }
 
 /// Call write() to write a number of bytes to the file.
@@ -333,19 +275,33 @@ static int buf_write_bytes(struct bw_info *ip)
   int len = ip->bw_len;      // length of data
   int flags = ip->bw_flags;  // extra flags
 
+  int converted = len;
+  int remaining = 0;
+
   // Skip conversion when writing the BOM.
   if (!(flags & FIO_NOCONVERT)) {
-    if (buf_write_convert(ip, &buf, &len) == FAIL) {
+    if ((converted = buf_write_convert(ip, &buf, &len)) < 0) {
+      return FAIL;
+    }
+
+    remaining = ip->bw_len - converted;
+  }
+
+  ip->bw_len = remaining;
+
+  // Skip writing while checking conversion
+  if (ip->bw_fd >= 0) {
+    int wlen = (int)write_eintr(ip->bw_fd, buf, (size_t)len);
+    if (wlen < len) {
       return FAIL;
     }
   }
 
-  if (ip->bw_fd < 0) {
-    // Only checking conversion, which is OK if we get here.
-    return OK;
+  if (remaining > 0) {
+    memmove(ip->bw_buf, ip->bw_buf + converted, (size_t)remaining);
   }
-  int wlen = (int)write_eintr(ip->bw_fd, buf, (size_t)len);
-  return (wlen < len) ? FAIL : OK;
+
+  return OK;
 }
 
 /// Check modification time of file, before writing to it.
@@ -1054,7 +1010,6 @@ int buf_write(buf_T *buf, char *fname, char *sfname, linenr_T start, linenr_T en
   write_info.bw_conv_buf = NULL;
   write_info.bw_conv_error = false;
   write_info.bw_conv_error_lnum = 0;
-  write_info.bw_restlen = 0;
   write_info.bw_iconv_fd = (iconv_t)-1;
 
   // After writing a file changedtick changes but we don't want to display
@@ -1269,13 +1224,11 @@ int buf_write(buf_T *buf, char *fname, char *sfname, linenr_T start, linenr_T en
   if (converted) {
     wb_flags = get_fio_flags(fenc);
     if (wb_flags & (FIO_UCS2 | FIO_UCS4 | FIO_UTF16 | FIO_UTF8)) {
-      // overallocate a bit, in case we read incomplete multi-byte chars
-      int size = bufsize + CONV_RESTLEN;
       // Need to allocate a buffer to translate into.
       if (wb_flags & (FIO_UCS2 | FIO_UTF16 | FIO_UTF8)) {
-        write_info.bw_conv_buflen = (size_t)size * 2;
+        write_info.bw_conv_buflen = (size_t)bufsize * 2;
       } else {       // FIO_UCS4
-        write_info.bw_conv_buflen = (size_t)size * 4;
+        write_info.bw_conv_buflen = (size_t)bufsize * 4;
       }
       write_info.bw_conv_buf = verbose_try_malloc(write_info.bw_conv_buflen);
       if (!write_info.bw_conv_buf) {
@@ -1477,11 +1430,10 @@ restore_backup:
       sha256_start(&sha_ctx);
     }
 
-    write_info.bw_len = bufsize;
+    write_info.bw_len = 0;
     write_info.bw_flags = wb_flags;
     fileformat = get_fileformat_force(buf, eap);
     char *s = buffer;
-    int len = 0;
     for (lnum = start; lnum <= end; lnum++) {
       // The next while loop is done once for each character written.
       // Keep it fast!
@@ -1499,16 +1451,15 @@ restore_backup:
           *s = c;
         }
         s++;
-        if (++len != bufsize) {
+        if (++write_info.bw_len != bufsize) {
           continue;
         }
         if (buf_write_bytes(&write_info) == FAIL) {
           end = 0;                        // write error: break loop
           break;
         }
-        nchars += bufsize;
-        s = buffer;
-        len = 0;
+        nchars += bufsize - write_info.bw_len;
+        s = buffer + write_info.bw_len;
         write_info.bw_start_lnum = lnum;
       }
       // write failed or last line has no EOL: stop here
@@ -1526,26 +1477,24 @@ restore_backup:
       } else {
         *s++ = CAR;                       // EOL_MAC or EOL_DOS: write CR
         if (fileformat == EOL_DOS) {      // write CR-NL
-          if (++len == bufsize) {
+          if (++write_info.bw_len == bufsize) {
             if (buf_write_bytes(&write_info) == FAIL) {
               end = 0;                    // write error: break loop
               break;
             }
-            nchars += bufsize;
-            s = buffer;
-            len = 0;
+            nchars += bufsize - write_info.bw_len;
+            s = buffer + write_info.bw_len;
           }
           *s++ = NL;
         }
       }
-      if (++len == bufsize) {
+      if (++write_info.bw_len == bufsize) {
         if (buf_write_bytes(&write_info) == FAIL) {
           end = 0;  // Write error: break loop.
           break;
         }
-        nchars += bufsize;
-        s = buffer;
-        len = 0;
+        nchars += bufsize - write_info.bw_len;
+        s = buffer + write_info.bw_len;
 
         os_breakcheck();
         if (got_int) {
@@ -1554,12 +1503,19 @@ restore_backup:
         }
       }
     }
-    if (len > 0 && end > 0) {
-      write_info.bw_len = len;
+    if (write_info.bw_len > 0 && end > 0) {
+      int remaining = write_info.bw_len;
       if (buf_write_bytes(&write_info) == FAIL) {
         end = 0;                      // write error
       }
-      nchars += len;
+      nchars += remaining - write_info.bw_len;
+    }
+
+    // Did we convert & write everything?
+    if (end != 0 && write_info.bw_len > 0) {
+      write_info.bw_conv_error = true;
+      write_info.bw_conv_error_lnum = end;
+      end = 0;
     }
 
     if (!buf->b_p_fixeol && buf->b_p_eof) {
diff --git a/test/functional/ex_cmds/write_spec.lua b/test/functional/ex_cmds/write_spec.lua
@@ -11,6 +11,7 @@ local api = n.api
 local skip = t.skip
 local is_os = t.is_os
 local is_ci = t.is_ci
+local read_file = t.read_file
 
 local fname = 'Xtest-functional-ex_cmds-write'
 local fname_bak = fname .. '~'
@@ -181,6 +182,99 @@ describe(':write', function()
     vim.uv.fs_symlink(fname_bak .. ('/xxxxx'):rep(20), fname)
     eq("Vim(write):E166: Can't open linked file for writing", pcall_err(command, 'write!'))
   end)
+
+  it('fails converting a trailing incomplete sequence', function()
+    -- From https://github.com/neovim/neovim/issues/36990, an invalid UTF-8 sequence at the end of
+    -- the file during conversion testing can overwrite the rest of the file during the real
+    -- conversion.
+
+    api.nvim_buf_set_lines(0, 0, 1, true, { 'line 1', 'line 2', 'aaabbb\235\128' })
+    command('set noendofline nofixendofline')
+
+    eq(
+      "Vim(write):E513: Write error, conversion failed in line 3 (make 'fenc' empty to override)",
+      pcall_err(command, 'write ++enc=latin1 ' .. fname)
+    )
+  end)
+
+  it('converts to latin1 with an invalid sequence at buffer boundary', function()
+    -- From https://github.com/neovim/neovim/issues/36990, an invalid UTF-8 sequence that falls
+    -- right at the end of the 8 KiB buffer used for encoding conversions causes subsequent data to
+    -- be overwritten.
+
+    local content = string.rep('a', 1024 * 8 - 1) .. '\251' .. string.rep('b', 20)
+    api.nvim_buf_set_lines(0, 0, 1, true, { content })
+    command('set noendofline nofixendofline fenc=latin1')
+    command('write ' .. fname)
+
+    local tail = string.sub(read_file(fname) or '', -10)
+    eq('bbbbbbbbbb', tail)
+  end)
+
+  it('converts to CP1251 with iconv', function()
+    api.nvim_buf_set_lines(
+      0,
+      0,
+      1,
+      true,
+      { 'Привет, мир!', 'Это простой тест.' }
+    )
+    command('write ++enc=cp1251 ++ff=unix ' .. fname)
+
+    eq(
+      '\207\240\232\226\229\242, \236\232\240!\n'
+        .. '\221\242\238 \239\240\238\241\242\238\233 \242\229\241\242.\n',
+      read_file(fname)
+    )
+  end)
+
+  it('converts to GB18030 with iconv', function()
+    api.nvim_buf_set_lines(0, 0, 1, true, { '你好，世界！', '这是一个测试。' })
+    command('write ++enc=gb18030 ++ff=unix ' .. fname)
+
+    eq(
+      '\196\227\186\195\163\172\202\192\189\231\163\161\n'
+        .. '\213\226\202\199\210\187\184\246\178\226\202\212\161\163\n',
+      read_file(fname)
+    )
+  end)
+
+  it('converts to Shift_JIS with iconv', function()
+    api.nvim_buf_set_lines(
+      0,
+      0,
+      1,
+      true,
+      { 'こんにちは、世界！', 'これはテストです。' }
+    )
+    command('write ++enc=sjis ++ff=unix ' .. fname)
+
+    eq(
+      '\130\177\130\241\130\201\130\191\130\205\129A\144\162\138E\129I\n'
+        .. '\130\177\130\234\130\205\131e\131X\131g\130\197\130\183\129B\n',
+      read_file(fname)
+    )
+  end)
+
+  it('fails converting an illegal sequence with iconv', function()
+    api.nvim_buf_set_lines(0, 0, 1, true, { 'line 1', 'aaa\128bbb' })
+
+    eq(
+      "Vim(write):E513: Write error, conversion failed (make 'fenc' empty to override)",
+      pcall_err(command, 'write ++enc=cp1251 ' .. fname)
+    )
+  end)
+
+  it('handles a multi-byte sequence crossing the buffer boundary converting with iconv', function()
+    local content = string.rep('a', 1024 * 8 - 1) .. 'Дbbbbb'
+    api.nvim_buf_set_lines(0, 0, 1, true, { content })
+    -- Skip the backup so we're testing the "checking" phase also.
+    command('set nowritebackup')
+    command('write ++enc=cp1251 ++ff=unix ' .. fname)
+
+    local expected = string.rep('a', 1024 * 8 - 1) .. '\196bbbbb\n'
+    eq(expected, read_file(fname))
+  end)
 end)
 
 describe(':update', function()

	neovim Neovim text editor
	git clone https://git.dasho.dev/neovim.git
	Log \| Files \| Refs \| README

M	src/nvim/bufwrite.c	\|	194	+++++++++++++++++++++++++++++++------------------------------------------------
M	test/functional/ex_cmds/write_spec.lua	\|	94	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++