Skip to content

Commit 80c4d4e

Browse files
brandonbloomclaude
andcommitted
Add binary output detection and file reference handling
- Implement automatic binary detection using null bytes and a 10% unprintable character threshold
- Add file reference opcodes (1<, 2<) for external file content
- Support file reference preservation during transcript updates
- Integrate commandBufferingHandler into Updater type for better cohesion
- Add comprehensive test coverage for binary detection and file handling
- Update documentation with binary output section and file reference opcodes

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent e976575 commit 80c4d4e

28 files changed

Lines changed: 524 additions & 33 deletions

CLAUDE.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
- Build with `./build.sh` instead of using `go build` directly.
22

3-
- .cmdt files use the Command Transcript language implemented by this project. Consult @README.md for the syntax and @tests/*.cmdt for example usages.
3+
- .cmdt files use the Command Transcript language implemented by this project. Consult @README.md for the syntax and @tests/*.cmdt for example usages.
4+
5+
- Always test using ./test.sh from the root directory.

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,15 @@ Operations with the following opcodes are supported:
147147
</p>
148148
</dd>
149149

150+
<dt><code>1&lt;</code>, <code>2&lt;</code> &mdash; file output</dt>
151+
<dd>
152+
<p>
153+
Like <code>1</code> and <code>2</code>, but reference a file containing
154+
the expected output instead of including it inline. File paths respect
155+
the current working directory.
156+
</p>
157+
</dd>
158+
150159
<dt><code>?</code> &mdash; exit-code</dt>
151160
<dd>
152161
<p>Exit code of the previously run command.</p>
@@ -192,6 +201,14 @@ Transcript inherits the working directory from the process that launches it.
192201
Directory changes (such as `cd` commands) persist throughout the transcript
193202
session, allowing tests to navigate and use relative paths consistently.
194203

204+
## Binary Output
205+
206+
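Transcript automatically detects binary output using heuristics: output that contains a null byte, or in which more than 10% of characters are unprintable, is treated as binary. Lines of plain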
207+
text are recorded inline using the standard `1` and `2` opcodes, while spans of
208+
binary data are written to incrementally numbered files (`001.bin`, `002.bin`,
209+
etc.) and referenced using `1<` and `2<` opcodes. This applies to both shell
210+
recording and automatic updating.
211+
195212
# Go API
196213

197214
In addition to the `transcript` CLI, there is a Go API for users who wish to

internal/core/binary.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package core
2+
3+
import (
4+
"unicode"
5+
"unicode/utf8"
6+
)
7+
8+
// isUnprintable reports whether r should count as unprintable for binary
// detection purposes. utf8.RuneError and any rune that is neither printable
// nor whitespace (for example, control characters) are unprintable.
//
// A bare rune cannot distinguish a decoding failure from a genuine U+FFFD in
// the input, so callers that decode bytes themselves should inspect the
// decoded width before consulting this function (as isBinary does).
func isUnprintable(r rune) bool {
	return r == utf8.RuneError || (!unicode.IsPrint(r) && !unicode.IsSpace(r))
}

// isBinary determines if data should be treated as binary output.
// It returns true if the data contains a null byte or if more than 10% of it
// is unprintable. Empty data is never binary.
//
// Input is decoded as UTF-8. Bytes that do not form a valid encoding count as
// unprintable, while a validly encoded U+FFFD replacement character is
// ordinary printable text and does not.
//
// Uses a byte-based threshold approximation for performance: the unprintable
// rune count is compared against 10% of the total byte count. This enables
// early termination on large inputs, though it may be slightly inaccurate for
// text with many multi-byte UTF-8 characters.
func isBinary(data []byte) bool {
	if len(data) == 0 {
		return false
	}

	// 10% of the byte count; integer division means inputs shorter than ten
	// bytes tolerate no unprintable characters at all.
	threshold := len(data) / 10
	unprintable := 0

	for len(data) > 0 {
		r, size := utf8.DecodeRune(data)
		data = data[size:]

		// A null byte is an instant binary classification.
		if r == 0 {
			return true
		}

		var bad bool
		if r == utf8.RuneError {
			// DecodeRune reports invalid encodings as (RuneError, 1). A wider
			// RuneError is a real U+FFFD present in the input, which is
			// printable and must not be counted.
			bad = size == 1
		} else {
			bad = isUnprintable(r)
		}
		if !bad {
			continue
		}

		unprintable++
		// Early exit once the threshold is exceeded (>10% of bytes).
		if unprintable > threshold {
			return true
		}
	}

	return false
}

internal/core/binary_test.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
package core
2+
3+
import "testing"
4+
5+
func TestIsBinary(t *testing.T) {
6+
tests := []struct {
7+
name string
8+
data []byte
9+
want bool
10+
}{
11+
{
12+
name: "empty data",
13+
data: []byte{},
14+
want: false,
15+
},
16+
{
17+
name: "normal text",
18+
data: []byte("hello world"),
19+
want: false,
20+
},
21+
{
22+
name: "text with newlines",
23+
data: []byte("hello\nworld\n"),
24+
want: false,
25+
},
26+
{
27+
name: "text with null byte",
28+
data: []byte("hello\x00world"),
29+
want: true,
30+
},
31+
{
32+
name: "text with multiple null bytes",
33+
data: []byte("\x00\x00\x00"),
34+
want: true,
35+
},
36+
{
37+
name: "text with high unprintable ratio",
38+
data: []byte("a\x01\x02\x03\x04\x05\x06\x07\x08\x09"), // 9 unprintable out of 10 = 90%
39+
want: true,
40+
},
41+
{
42+
name: "text with low unprintable ratio",
43+
data: []byte("hello world\x01"), // 1 unprintable out of 12 = 8.3%
44+
want: false,
45+
},
46+
{
47+
name: "text exactly at 10% threshold",
48+
data: []byte("abcdefghi\x01"), // 1 unprintable out of 10 = 10%
49+
want: false,
50+
},
51+
{
52+
name: "text just over 10% threshold",
53+
data: []byte("abcdefgh\x01\x02"), // 2 unprintable out of 10 = 20%
54+
want: true,
55+
},
56+
{
57+
name: "single byte printable",
58+
data: []byte("a"),
59+
want: false,
60+
},
61+
{
62+
name: "single byte unprintable",
63+
data: []byte{0x01},
64+
want: true,
65+
},
66+
{
67+
name: "unicode text",
68+
data: []byte("hello 世界"),
69+
want: false,
70+
},
71+
{
72+
name: "invalid utf8",
73+
data: []byte{0x80, 0x81, 0x82},
74+
want: true,
75+
},
76+
{
77+
name: "tab and space characters",
78+
data: []byte("hello\tworld\n"),
79+
want: false,
80+
},
81+
}
82+
83+
for _, tt := range tests {
84+
t.Run(tt.name, func(t *testing.T) {
85+
got := isBinary(tt.data)
86+
if got != tt.want {
87+
t.Errorf("isBinary(%q) = %v, want %v", tt.data, got, tt.want)
88+
}
89+
})
90+
}
91+
}

internal/core/checker.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import (
55
"context"
66
"fmt"
77
"io"
8+
"os"
9+
"strings"
810
)
911

1012
type Checker struct {
@@ -55,6 +57,36 @@ func (ckr *checkHandler) HandleOutput(ctx context.Context, fd int, line string)
5557
return ckr.expectOutput(fmt.Sprintf("%d%s%s", fd, sep, line))
5658
}
5759

60+
func (ckr *checkHandler) HandleFileOutput(ctx context.Context, fd int, filepath string) error {
61+
// Read the expected file content.
62+
expectedData, err := os.ReadFile(filepath)
63+
if err != nil {
64+
return fmt.Errorf("reading expected file %s: %w", filepath, err)
65+
}
66+
67+
// Build the expected output string that would be generated if this was inline.
68+
if isBinary(expectedData) {
69+
// For binary files, we expect the file reference format.
70+
expectedOutput := fmt.Sprintf("%d< %s", fd, filepath)
71+
return ckr.expectOutput(expectedOutput)
72+
} else {
73+
// For text files, we expect the inline format.
74+
var builder strings.Builder
75+
for line := range bytes.Lines(expectedData) {
76+
if len(line) == 1 && line[0] == '\n' {
77+
fmt.Fprintf(&builder, "%d\n", fd)
78+
} else {
79+
fmt.Fprintf(&builder, "%d %s", fd, line)
80+
}
81+
}
82+
// Handle case where original didn't end with newline.
83+
if len(expectedData) > 0 && expectedData[len(expectedData)-1] != '\n' {
84+
builder.WriteString("\n% no-newline\n")
85+
}
86+
return ckr.expectOutput(builder.String())
87+
}
88+
}
89+
5890
func (ckr *checkHandler) HandleNoNewline(ctx context.Context, fd int) error {
5991
// Assumes the previous line contains an already written newline.
6092
// This is also why we can ignore the fd parameter, as it's assumed to

internal/core/errors.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,4 +57,3 @@ var yellow = color.New(color.FgYellow)
5757
var cyan = color.New(color.FgCyan)
5858
var green = color.New(color.FgGreen)
5959
var red = color.New(color.FgRed)
60-

internal/core/interp.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ type Handler interface {
4040
// Corresponds to cmdt syntax: "1 stdout line" or "2 stderr line".
4141
HandleOutput(ctx context.Context, fd int, line string) error
4242

43+
// HandleFileOutput processes expected output that references an external file.
44+
// The fd parameter indicates the file descriptor: 1 for stdout, 2 for stderr.
45+
// The filepath parameter specifies the file containing the expected output.
46+
// Corresponds to cmdt syntax: "1< filename" or "2< filename".
47+
HandleFileOutput(ctx context.Context, fd int, filepath string) error
48+
4349
// HandleNoNewline indicates that the last output line did not end with a newline.
4450
// The fd parameter indicates which stream (stdout=1, stderr=2) lacks the newline.
4551
// Corresponds to cmdt syntax: "% no-newline".
@@ -99,6 +105,14 @@ func (t *Interpreter) ExecLine(ctx context.Context, text string) error {
99105
t.prevFD = fd
100106
return hdlr.HandleOutput(ctx, fd, payload)
101107

108+
case "1<", "2<":
109+
if !t.acceptResults {
110+
return t.syntaxErrorf("unexpected file output check")
111+
}
112+
fd := int(opcode[0]) - '1' + 1
113+
t.prevFD = fd
114+
return hdlr.HandleFileOutput(ctx, fd, payload)
115+
102116
case "?":
103117
if !t.acceptResults {
104118
return t.syntaxErrorf("unexpected exit status check")

internal/core/recorder.go

Lines changed: 56 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"context"
66
"fmt"
77
"io"
8+
"os"
89
"strings"
910

1011
"mvdan.cc/sh/v3/interp"
@@ -21,10 +22,13 @@ type Recorder struct {
2122
// Transcript captures the recorded output in cmdt format.
2223
Transcript bytes.Buffer
2324

24-
needsBlank bool
25-
runner *interp.Runner
26-
stdoutBuf bytes.Buffer
27-
stderrBuf bytes.Buffer
25+
needsBlank bool
26+
runner *interp.Runner
27+
stdoutBuf bytes.Buffer
28+
stderrBuf bytes.Buffer
29+
fileCount int // Counter for auto-generated binary file names
30+
preferredFiles []string // List of preferred filenames in order (stderr first, then stdout)
31+
fileIndex int // Current position in preferredFiles slice
2832
}
2933

3034
func (rec *Recorder) Init() error {
@@ -34,6 +38,8 @@ func (rec *Recorder) Init() error {
3438
io.MultiWriter(&rec.stdoutBuf, orDiscard(rec.Stdout)),
3539
io.MultiWriter(&rec.stderrBuf, orDiscard(rec.Stderr)),
3640
))
41+
rec.preferredFiles = make([]string, 0)
42+
rec.fileIndex = 0
3743
return err
3844
}
3945

@@ -44,42 +50,78 @@ func orDiscard(w io.Writer) io.Writer {
4450
return w
4551
}
4652

53+
// SetPreferredFiles sets the list of preferred filenames in order.
54+
// Files should be provided in deterministic order (stderr first, then stdout).
55+
func (rec *Recorder) SetPreferredFiles(files []string) {
56+
rec.preferredFiles = make([]string, len(files))
57+
copy(rec.preferredFiles, files)
58+
rec.fileIndex = 0
59+
}
60+
61+
// generateBinaryFilename creates a filename, preferring existing names when available.
62+
// Uses deterministic ordering (stderr first, then stdout) to consume preferred filenames.
63+
func (rec *Recorder) generateBinaryFilename() string {
64+
// Check if we have a preferred filename available.
65+
if rec.fileIndex < len(rec.preferredFiles) {
66+
filename := rec.preferredFiles[rec.fileIndex]
67+
rec.fileIndex++
68+
return filename
69+
}
70+
71+
// Fall back to auto-generated filename.
72+
rec.fileCount++
73+
return fmt.Sprintf("%03d.bin", rec.fileCount)
74+
}
75+
4776
func (rec *Recorder) flush() error {
4877
// Write stderr first (usually empty, text-only, important not to miss).
49-
if err := rec.flushBuffer(&rec.stderrBuf, "2"); err != nil {
78+
if err := rec.flushBuffer(&rec.stderrBuf, 2); err != nil {
5079
return err
5180
}
5281
// Then write stdout.
53-
if err := rec.flushBuffer(&rec.stdoutBuf, "1"); err != nil {
82+
if err := rec.flushBuffer(&rec.stdoutBuf, 1); err != nil {
5483
return err
5584
}
5685
return nil
5786
}
5887

59-
func (rec *Recorder) flushBuffer(buf *bytes.Buffer, prefix string) error {
88+
// flushBuffer processes output from a command and writes it to the transcript.
89+
// Individual command outputs are expected to be reasonably small (not streaming large files).
90+
func (rec *Recorder) flushBuffer(buf *bytes.Buffer, fd int) error {
6091
if buf.Len() == 0 {
6192
return nil
6293
}
63-
94+
6495
data := buf.Bytes()
6596
buf.Reset()
66-
67-
// Add prefix to each line and write to transcript.
97+
98+
// Check if data is binary.
99+
if isBinary(data) {
100+
// Write binary data to file and reference it.
101+
filename := rec.generateBinaryFilename()
102+
if err := os.WriteFile(filename, data, 0644); err != nil {
103+
return fmt.Errorf("writing binary file %q: %w", filename, err)
104+
}
105+
fmt.Fprintf(&rec.Transcript, "%d< %s\n", fd, filename)
106+
return nil
107+
}
108+
109+
// Handle text output - add prefix to each line and write to transcript.
68110
for line := range bytes.Lines(data) {
69111
if len(line) == 1 && line[0] == '\n' {
70112
// Empty line - just prefix.
71-
fmt.Fprintf(&rec.Transcript, "%s\n", prefix)
113+
fmt.Fprintf(&rec.Transcript, "%d\n", fd)
72114
} else {
73115
// Non-empty line - prefix + space + line.
74-
fmt.Fprintf(&rec.Transcript, "%s %s", prefix, line)
116+
fmt.Fprintf(&rec.Transcript, "%d %s", fd, line)
75117
}
76118
}
77-
119+
78120
// Handle case where original didn't end with newline.
79121
if len(data) > 0 && data[len(data)-1] != '\n' {
80122
io.WriteString(&rec.Transcript, "\n% no-newline\n")
81123
}
82-
124+
83125
return nil
84126
}
85127

0 commit comments

Comments
 (0)