fingerprinting limited to first cartridgeloader.FingerprintLimit bytes

looking beyond this limit is unlikely to reveal any data of value and it
can only cause excess slowdown for very large files, which are unlikely
to be cartridge files in any case
This commit is contained in:
JetSetIlly 2024-04-17 14:17:19 +01:00
parent 1850b03ff5
commit 16adef8e7b
4 changed files with 73 additions and 36 deletions

View file

@ -53,11 +53,41 @@
// A file extension of "BIN", "ROM", "A26" indicates that the data should be // A file extension of "BIN", "ROM", "A26" indicates that the data should be
// fingerprinted as normal. // fingerprinted as normal.
// //
// # Preloaded data
//
// Cartridges with lots of data will be streamed off disk as required. For
// example, Moviecart or Supercharger audio tapes can be large and don't need to
// exist in memory for a very long time.
//
// However, for practical reasons the first 1MB of data of any file will be
// 'preloaded'. When reading cartridge data you don't need to worry about
// whether data has been preloaded or not, except that it does affect both
// hashing and fingerprinting.
//
// # Hashes // # Hashes
// //
// Creating a cartridge loader with NewLoaderFromFilename() or // The creation of a cartridge loader includes the creation of both a SHA1 and
// NewLoaderFromData() will also create a SHA1 and MD5 hash of the data. The // an MD5 hash. Hashes are useful for matching cartridges regardless of path
// amount of data used to create the has is limited to 1MB. For most cartridges // or filename
// this will mean the hash is taken using all the data but some cartridge are //
// likely to have much more data than that. // The data used to create the hash is limited to the data that has been
// preloaded (see above).
//
// # Fingerprinting
//
// Cartridge data can be checked for 'fingerprint' data that can be used to
// decide on the 'mapping' the cartridge uses. The three cartridge loader
// functions, Contains(), ContainsLimit() and Count() can be used to search the
// preloaded data (see above) for specific byte sequences.
//
// More complex fingerprinting can be done with the Read() function. However,
// because the Read() function works with the complete cartridge and not just
// the preloaded data, care should be taken not to read too much of the data for
// reasons of computation time. The constant value FingerprintLimit is provided
// as a useful value to which a Read() loop can be limited.
//
// Once fingerprinting has been completed it is very important to remember to
// reset the Read() position with the Seek() command:
//
// cartridgeloader.Seek(0, io.SeekStart)
package cartridgeloader package cartridgeloader

View file

@ -31,13 +31,20 @@ import (
"github.com/jetsetilly/gopher2600/archivefs" "github.com/jetsetilly/gopher2600/archivefs"
) )
// the maximum amount of data to load into the peep slice // the maximum amount of data to preload
const maxPeepLength = 1048576 const maxPreloadLength = 1048576
// makes sures that data is capped at peep length. use this function when // Fingerprinting beyond the first 64k or so of cartridge data can result in
// assigning to the Loader.peep field // very slow fingerprinting, particularly if looking at a large file that is not a
func peepData(data []byte) []byte { // cartridge file at all
return data[:min(len(data), maxPeepLength)] //
// The 64k value is arbitrary but in practice it's a sufficiently large value and
// any data beyond that limit is unlikely to reveal anything of worth
const FingerprintLimit = 65536
// use this function when assigning to the Loader.preload field
func preloadLimit(data []byte) []byte {
return data[:min(len(data), maxPreloadLength)]
} }
// Loader abstracts all the ways data can be loaded into the emulation. // Loader abstracts all the ways data can be loaded into the emulation.
@ -72,13 +79,15 @@ type Loader struct {
data io.ReadSeeker data io.ReadSeeker
size int size int
// peep is the data at the beginning of the cartridge data. it is used to // preload is the data at the beginning of the cartridge data that has been
// help fingerprinting and for creating the SHA1 and MD5 hashes // preloaded immediately on creation of the cartridge loader
// //
// in reality, most cartridges are small enough to fit entirely inside the // in reality, most cartridges are small enough to fit entirely inside the
// peep field. currently it is only moviecart data and supercharger sound // preload field. currently it is only moviecart data and supercharger sound
// files that are ever arger than maxPeepLength // files that are ever larger than that
peep []byte //
// the preload data is used to create the hashes
preload []byte
// data was supplied through NewLoaderFromData() // data was supplied through NewLoaderFromData()
embedded bool embedded bool
@ -190,7 +199,7 @@ func NewLoaderFromData(name string, data []byte, mapping string) (Loader, error)
ld := Loader{ ld := Loader{
Filename: name, Filename: name,
Mapping: mapping, Mapping: mapping,
peep: peepData(data), preload: preloadLimit(data),
data: bytes.NewReader(data), data: bytes.NewReader(data),
HashSHA1: fmt.Sprintf("%x", sha1.Sum(data)), HashSHA1: fmt.Sprintf("%x", sha1.Sum(data)),
HashMD5: fmt.Sprintf("%x", md5.Sum(data)), HashMD5: fmt.Sprintf("%x", md5.Sum(data)),
@ -232,7 +241,7 @@ func (ld *Loader) Close() error {
} }
ld.data = nil ld.data = nil
ld.size = 0 ld.size = 0
ld.peep = nil ld.preload = nil
return nil return nil
} }
@ -258,21 +267,22 @@ func (ld Loader) Size() int {
return ld.size return ld.size
} }
// Contains returns true if subslice appears anywhere in the peep data // Contains returns true if subslice appears anywhere in the preload data.
func (ld Loader) Contains(subslice []byte) bool { func (ld Loader) Contains(subslice []byte) bool {
return bytes.Contains(ld.peep, subslice) return bytes.Contains(ld.preload, subslice)
} }
// ContainsLimit returns true if subslice appears in the peep data at an offset between // ContainsLimit returns true if subslice appears anywhere in the preload data and
// zero and limit // within the byte limit value supplied as a function parameter.
func (ld Loader) ContainsLimit(limit int, subslice []byte) bool { func (ld Loader) ContainsLimit(limit int, subslice []byte) bool {
limit = min(limit, ld.Size()) limit = min(limit, ld.Size())
return bytes.Contains(ld.peep[:limit], subslice) return bytes.Contains(ld.preload[:limit], subslice)
} }
// Count returns the number of non-overlapping instances of subslice in the peep data // Count returns the number of non-overlapping instances of subslice in the
// preload data.
func (ld Loader) Count(subslice []byte) int { func (ld Loader) Count(subslice []byte) int {
return bytes.Count(ld.peep, subslice) return bytes.Count(ld.preload, subslice)
} }
// open the cartridge data. filenames with a valid schema will use that method // open the cartridge data. filenames with a valid schema will use that method
@ -303,7 +313,7 @@ func (ld *Loader) open() error {
ld.data = bytes.NewReader(data) ld.data = bytes.NewReader(data)
ld.size = len(data) ld.size = len(data)
ld.peep = peepData(data) ld.preload = preloadLimit(data)
case "file": case "file":
fallthrough fallthrough
@ -314,8 +324,7 @@ func (ld *Loader) open() error {
return fmt.Errorf("loader: %w", err) return fmt.Errorf("loader: %w", err)
} }
// peep at data ld.preload, err = io.ReadAll(io.LimitReader(r, maxPreloadLength))
ld.peep, err = io.ReadAll(io.LimitReader(r, maxPeepLength))
if err != nil { if err != nil {
return fmt.Errorf("loader: %w", err) return fmt.Errorf("loader: %w", err)
} }
@ -328,8 +337,8 @@ func (ld *Loader) open() error {
} }
// generate hashes // generate hashes
ld.HashSHA1 = fmt.Sprintf("%x", sha1.Sum(ld.peep)) ld.HashSHA1 = fmt.Sprintf("%x", sha1.Sum(ld.preload))
ld.HashMD5 = fmt.Sprintf("%x", md5.Sum(ld.peep)) ld.HashMD5 = fmt.Sprintf("%x", md5.Sum(ld.preload))
return nil return nil
} }

View file

@ -75,7 +75,7 @@ func (cart *Cartridge) fingerprintPlusROM(loader cartridgeloader.Loader) bool {
b := make([]byte, 3) b := make([]byte, 3)
loader.Seek(0, io.SeekStart) loader.Seek(0, io.SeekStart)
for { for i := 0; i < cartridgeloader.FingerprintLimit-len(b); i++ {
n, err := loader.Read(b) n, err := loader.Read(b)
if n < len(b) { if n < len(b) {
break break
@ -148,7 +148,7 @@ func fingerprintMnetwork(loader cartridgeloader.Loader) bool {
b := make([]byte, 3) b := make([]byte, 3)
loader.Seek(0, io.SeekStart) loader.Seek(0, io.SeekStart)
for { for i := 0; i < cartridgeloader.FingerprintLimit-len(b); i++ {
n, err := loader.Read(b) n, err := loader.Read(b)
if n < len(b) { if n < len(b) {
break break
@ -277,7 +277,7 @@ func fingerprintCDF(loader cartridgeloader.Loader) (bool, string) {
b := make([]byte, 4) b := make([]byte, 4)
loader.Seek(0, io.SeekStart) loader.Seek(0, io.SeekStart)
for { for i := 0; i < cartridgeloader.FingerprintLimit-len(b); i++ {
n, err := loader.Read(b) n, err := loader.Read(b)
if n < len(b) { if n < len(b) {
break break

View file

@ -75,9 +75,7 @@ func Fingerprint(port plugging.PortID, loader cartridgeloader.Loader) ports.NewP
func matchPattern(patterns [][]byte, loader cartridgeloader.Loader) bool { func matchPattern(patterns [][]byte, loader cartridgeloader.Loader) bool {
for _, p := range patterns { for _, p := range patterns {
// limit check to the first 64k of data. any data beyond that is likely if loader.ContainsLimit(cartridgeloader.FingerprintLimit, p) {
// to be non-program data and only likely to return false-positives
if loader.ContainsLimit(65536, p) {
return true return true
} }
} }