From 16adef8e7b320a424b7646b8bbf6eb2cf9f34be1 Mon Sep 17 00:00:00 2001
From: JetSetIlly <stephen.t.illingworth@gmail.com>
Date: Wed, 17 Apr 2024 14:17:19 +0100
Subject: [PATCH] fingerprinting limited to first
 cartridgeloader.FingerprintLimit bytes

looking beyond this limit is unlikely to reveal any data of value and it
can only cause excess slowdown for very large files, which are unlikely
to be cartridge files in any case
---
 cartridgeloader/doc.go                   | 40 ++++++++++++++--
 cartridgeloader/loader.go                | 59 ++++++++++++++----------
 hardware/memory/cartridge/fingerprint.go |  6 +--
 hardware/peripherals/fingerprint.go      |  4 +-
 4 files changed, 73 insertions(+), 36 deletions(-)

diff --git a/cartridgeloader/doc.go b/cartridgeloader/doc.go
index 6f1800bc..3811ea8f 100644
--- a/cartridgeloader/doc.go
+++ b/cartridgeloader/doc.go
@@ -53,11 +53,41 @@
 // A file extension of "BIN", "ROM", "A26" indicates that the data should be
 // fingerprinted as normal.
 //
+// # Preloaded data
+//
+// Cartridges with lots of data will be streamed off disk as required. For
+// example, Moviecart or Supercharger audio tapes can be large and don't need to
+// exist in memory for a very long time.
+//
+// However, for practical reasons the first 1MB of data of any file will be
+// 'preloaded'. When reading cartridge data you don't need to worry about
+// whether data has been preloaded or not, except that it does affect both
+// hashing and fingerprinting.
+//
 // # Hashes
 //
-// Creating a cartridge loader with NewLoaderFromFilename() or
-// NewLoaderFromData() will also create a SHA1 and MD5 hash of the data. The
-// amount of data used to create the has is limited to 1MB. For most cartridges
-// this will mean the hash is taken using all the data but some cartridge are
-// likely to have much more data than that.
+// The creation of a cartridge loader includes the creation of both a SHA1 and
+// an MD5 hash. Hashes are useful for matching cartridges regardless of path
+// or filename.
+//
+// The data used to create the hash is limited to the data that has been
+// preloaded (see above).
+//
+// # Fingerprinting
+//
+// Cartridge data can be checked for 'fingerprint' data that can be used to
+// decide on the 'mapping' the cartridge uses. The three cartridge loader
+// functions, Contains(), ContainsLimit() and Count() can be used to search the
+// preloaded data (see above) for specific byte sequences.
+//
+// More complex fingerprinting can be done with the Read() function. However,
+// because the Read() function works with the complete cartridge and not just
+// the preloaded data, care should be taken not to read too much of the data for
+// reasons of computation time. The constant value FingerprintLimit is provided
+// as a useful value to which a Read() loop can be limited.
+//
+// Once fingerprinting has been completed it is very important to remember to
+// reset the Read() position with the Seek() command:
+//
+//	cartridgeloader.Seek(0, io.SeekStart)
 package cartridgeloader
diff --git a/cartridgeloader/loader.go b/cartridgeloader/loader.go
index a06fdace..e21b1480 100644
--- a/cartridgeloader/loader.go
+++ b/cartridgeloader/loader.go
@@ -31,13 +31,20 @@ import (
 	"github.com/jetsetilly/gopher2600/archivefs"
 )
 
-// the maximum amount of data to load into the peep slice
-const maxPeepLength = 1048576
+// the maximum amount of data to preload
+const maxPreloadLength = 1048576
 
-// makes sures that data is capped at peep length. use this function when
-// assigning to the Loader.peep field
-func peepData(data []byte) []byte {
-	return data[:min(len(data), maxPeepLength)]
+// Fingerprinting beyond the first 64k or so of cartridge data can result in
+// very slow fingerprinting, particularly if looking at a large file that is not a
+// cartridge file at all
+//
+// The 64k value is arbitrary but in practice it's a sufficiently large value and
+// any data beyond that limit is unlikely to reveal anything of worth
+const FingerprintLimit = 65536
+
+// use this function when assigning to the Loader.preload field
+func preloadLimit(data []byte) []byte {
+	return data[:min(len(data), maxPreloadLength)]
 }
 
 // Loader abstracts all the ways data can be loaded into the emulation.
@@ -72,13 +79,15 @@ type Loader struct {
 	data io.ReadSeeker
 	size int
 
-	// peep is the data at the beginning of the cartridge data. it is used to
-	// help fingerprinting and for creating the SHA1 and MD5 hashes
+	// preload is the data at the beginning of the cartridge data that has been
+	// preloaded immediately on creation of the cartridge loader
 	//
 	// in reality, most cartridges are small enough to fit entirely inside the
-	// peep field. currently it is only moviecart data and supercharger sound
-	// files that are ever arger than maxPeepLength
-	peep []byte
+	// preload field. currently it is only moviecart data and supercharger sound
+	// files that are ever larger than that
+	//
+	// the preload data is used to create the hashes
+	preload []byte
 
 	// data was supplied through NewLoaderFromData()
 	embedded bool
@@ -190,7 +199,7 @@ func NewLoaderFromData(name string, data []byte, mapping string) (Loader, error)
 	ld := Loader{
 		Filename: name,
 		Mapping:  mapping,
-		peep:     peepData(data),
+		preload:  preloadLimit(data),
 		data:     bytes.NewReader(data),
 		HashSHA1: fmt.Sprintf("%x", sha1.Sum(data)),
 		HashMD5:  fmt.Sprintf("%x", md5.Sum(data)),
@@ -232,7 +241,7 @@ func (ld *Loader) Close() error {
 	}
 	ld.data = nil
 	ld.size = 0
-	ld.peep = nil
+	ld.preload = nil
 
 	return nil
 }
@@ -258,21 +267,22 @@ func (ld Loader) Size() int {
 	return ld.size
 }
 
-// Contains returns true if subslice appears anywhere in the peep data
+// Contains returns true if subslice appears anywhere in the preload data.
 func (ld Loader) Contains(subslice []byte) bool {
-	return bytes.Contains(ld.peep, subslice)
+	return bytes.Contains(ld.preload, subslice)
 }
 
-// ContainsLimit returns true if subslice appears in the peep data at an offset between
-// zero and limit
+// ContainsLimit returns true if subslice appears anywhere in the preload data and
+// within the byte limit value supplied as a function parameter.
 func (ld Loader) ContainsLimit(limit int, subslice []byte) bool {
 	limit = min(limit, ld.Size())
-	return bytes.Contains(ld.peep[:limit], subslice)
+	return bytes.Contains(ld.preload[:limit], subslice)
 }
 
-// Count returns the number of non-overlapping instances of subslice in the peep data
+// Count returns the number of non-overlapping instances of subslice in the
+// preload data.
 func (ld Loader) Count(subslice []byte) int {
-	return bytes.Count(ld.peep, subslice)
+	return bytes.Count(ld.preload, subslice)
 }
 
 // open the cartridge data. filenames with a valid schema will use that method
@@ -303,7 +313,7 @@ func (ld *Loader) open() error {
 
 		ld.data = bytes.NewReader(data)
 		ld.size = len(data)
-		ld.peep = peepData(data)
+		ld.preload = preloadLimit(data)
 
 	case "file":
 		fallthrough
@@ -314,8 +324,7 @@ func (ld *Loader) open() error {
 			return fmt.Errorf("loader: %w", err)
 		}
 
-		// peep at data
-		ld.peep, err = io.ReadAll(io.LimitReader(r, maxPeepLength))
+		ld.preload, err = io.ReadAll(io.LimitReader(r, maxPreloadLength))
 		if err != nil {
 			return fmt.Errorf("loader: %w", err)
 		}
@@ -328,8 +337,8 @@ func (ld *Loader) open() error {
 	}
 
 	// generate hashes
-	ld.HashSHA1 = fmt.Sprintf("%x", sha1.Sum(ld.peep))
-	ld.HashMD5 = fmt.Sprintf("%x", md5.Sum(ld.peep))
+	ld.HashSHA1 = fmt.Sprintf("%x", sha1.Sum(ld.preload))
+	ld.HashMD5 = fmt.Sprintf("%x", md5.Sum(ld.preload))
 
 	return nil
 }
diff --git a/hardware/memory/cartridge/fingerprint.go b/hardware/memory/cartridge/fingerprint.go
index de04768a..f4e73ba5 100644
--- a/hardware/memory/cartridge/fingerprint.go
+++ b/hardware/memory/cartridge/fingerprint.go
@@ -75,7 +75,7 @@ func (cart *Cartridge) fingerprintPlusROM(loader cartridgeloader.Loader) bool {
 	b := make([]byte, 3)
 	loader.Seek(0, io.SeekStart)
 
-	for {
+	for i := 0; i < cartridgeloader.FingerprintLimit-len(b); i++ {
 		n, err := loader.Read(b)
 		if n < len(b) {
 			break
@@ -148,7 +148,7 @@ func fingerprintMnetwork(loader cartridgeloader.Loader) bool {
 	b := make([]byte, 3)
 	loader.Seek(0, io.SeekStart)
 
-	for {
+	for i := 0; i < cartridgeloader.FingerprintLimit-len(b); i++ {
 		n, err := loader.Read(b)
 		if n < len(b) {
 			break
@@ -277,7 +277,7 @@ func fingerprintCDF(loader cartridgeloader.Loader) (bool, string) {
 	b := make([]byte, 4)
 	loader.Seek(0, io.SeekStart)
 
-	for {
+	for i := 0; i < cartridgeloader.FingerprintLimit-len(b); i++ {
 		n, err := loader.Read(b)
 		if n < len(b) {
 			break
diff --git a/hardware/peripherals/fingerprint.go b/hardware/peripherals/fingerprint.go
index ae922969..748a6bd6 100644
--- a/hardware/peripherals/fingerprint.go
+++ b/hardware/peripherals/fingerprint.go
@@ -75,9 +75,7 @@ func Fingerprint(port plugging.PortID, loader cartridgeloader.Loader) ports.NewP
 
 func matchPattern(patterns [][]byte, loader cartridgeloader.Loader) bool {
 	for _, p := range patterns {
-		// limit check to the first 64k of data. any data beyond that is likely
-		// to be non-program data and only likely to return false-positives
-		if loader.ContainsLimit(65536, p) {
+		if loader.ContainsLimit(cartridgeloader.FingerprintLimit, p) {
 			return true
 		}
 	}