fingerprinting limited to first cartridgeloader.FingerprintLimit bytes

looking beyond this limit is unlikely to reveal any data of value and it
can only cause excess slowdown for very large files, which are unlikely
to be cartridge files in any case
This commit is contained in:
JetSetIlly 2024-04-17 14:17:19 +01:00
parent 1850b03ff5
commit 16adef8e7b
4 changed files with 73 additions and 36 deletions

View file

@ -53,11 +53,41 @@
// A file extension of "BIN", "ROM", "A26" indicates that the data should be
// fingerprinted as normal.
//
// # Preloaded data
//
// Cartridges with lots of data will be streamed off disk as required. For
// example, Moviecart or Supercharger audio tapes can be large and don't need to
// exist in memory for a very long time.
//
// However, for practical reasons the first 1MB of data of any file will be
// 'preloaded'. When reading cartridge data you don't need to worry about
// whether data has been preloaded or not, except that it does affect both
// hashing and fingerprinting.
//
// # Hashes
//
// Creating a cartridge loader with NewLoaderFromFilename() or
// NewLoaderFromData() will also create a SHA1 and MD5 hash of the data. The
// amount of data used to create the has is limited to 1MB. For most cartridges
// this will mean the hash is taken using all the data but some cartridge are
// likely to have much more data than that.
// The creation of a cartridge loader includes the creation of both a SHA1 and
// an MD5 hash. Hashes are useful for matching cartridges regardless of path
// or filename.
//
// The data used to create the hash is limited to the data that has been
// preloaded (see above).
//
// # Fingerprinting
//
// Cartridge data can be checked for 'fingerprint' data that can be used to
// decide on the 'mapping' the cartridge uses. The three cartridge loader
// functions, Contains(), ContainsLimit() and Count() can be used to search the
// preloaded data (see above) for specific byte sequences.
//
// More complex fingerprinting can be done with the Read() function. However,
// because the Read() function works with the complete cartridge and not just
// the preloaded data, care should be taken not to read too much of the data for
// reasons of computation time. The constant value FingerprintLimit is provided
// as a useful value to which a Read() loop can be limited.
//
// Once fingerprinting has been completed it is very important to remember to
// reset the Read() position with the Seek() command:
//
// cartridgeloader.Seek(0, io.SeekStart)
package cartridgeloader

View file

@ -31,13 +31,20 @@ import (
"github.com/jetsetilly/gopher2600/archivefs"
)
// the maximum amount of data to load into the peep slice
const maxPeepLength = 1048576
// the maximum amount of data to preload
const maxPreloadLength = 1048576
// makes sures that data is capped at peep length. use this function when
// assigning to the Loader.peep field
func peepData(data []byte) []byte {
return data[:min(len(data), maxPeepLength)]
// FingerprintLimit is a suggested upper bound, in bytes, on how much cartridge
// data a fingerprinting routine should examine.
//
// Fingerprinting beyond the first 64k or so of cartridge data can result in
// very slow fingerprinting, particularly if looking at a large file that is not
// a cartridge file at all.
//
// The 64k value is arbitrary but in practice it's a sufficiently large value and
// any data beyond that limit is unlikely to reveal anything of worth.
const FingerprintLimit = 65536
// preloadLimit returns data truncated to at most maxPreloadLength bytes. the
// result is a sub-slice of data (no copy is made), so it shares the same
// backing array.
//
// use this function when assigning to the Loader.preload field
func preloadLimit(data []byte) []byte {
return data[:min(len(data), maxPreloadLength)]
}
// Loader abstracts all the ways data can be loaded into the emulation.
@ -72,13 +79,15 @@ type Loader struct {
data io.ReadSeeker
size int
// peep is the data at the beginning of the cartridge data. it is used to
// help fingerprinting and for creating the SHA1 and MD5 hashes
// preload is the data at the beginning of the cartridge data that has been
// preloaded immediately on creation of the cartridge loader
//
// in reality, most cartridges are small enough to fit entirely inside the
// peep field. currently it is only moviecart data and supercharger sound
// files that are ever arger than maxPeepLength
peep []byte
// preload field. currently it is only moviecart data and supercharger sound
// files that are ever larger than that
//
// the preload data is used to create the hashes
preload []byte
// data was supplied through NewLoaderFromData()
embedded bool
@ -190,7 +199,7 @@ func NewLoaderFromData(name string, data []byte, mapping string) (Loader, error)
ld := Loader{
Filename: name,
Mapping: mapping,
peep: peepData(data),
preload: preloadLimit(data),
data: bytes.NewReader(data),
HashSHA1: fmt.Sprintf("%x", sha1.Sum(data)),
HashMD5: fmt.Sprintf("%x", md5.Sum(data)),
@ -232,7 +241,7 @@ func (ld *Loader) Close() error {
}
ld.data = nil
ld.size = 0
ld.peep = nil
ld.preload = nil
return nil
}
@ -258,21 +267,22 @@ func (ld Loader) Size() int {
return ld.size
}
// Contains returns true if subslice appears anywhere in the peep data
// Contains returns true if subslice appears anywhere in the preload data.
func (ld Loader) Contains(subslice []byte) bool {
return bytes.Contains(ld.peep, subslice)
return bytes.Contains(ld.preload, subslice)
}
// ContainsLimit returns true if subslice appears in the peep data at an offset between
// zero and limit
// ContainsLimit returns true if subslice appears anywhere in the preload data and
// within the byte limit value supplied as a function parameter.
func (ld Loader) ContainsLimit(limit int, subslice []byte) bool {
limit = min(limit, ld.Size())
return bytes.Contains(ld.peep[:limit], subslice)
return bytes.Contains(ld.preload[:limit], subslice)
}
// Count returns the number of non-overlapping instances of subslice in the peep data
// Count returns the number of non-overlapping instances of subslice in the
// preload data.
func (ld Loader) Count(subslice []byte) int {
return bytes.Count(ld.peep, subslice)
return bytes.Count(ld.preload, subslice)
}
// open the cartridge data. filenames with a valid schema will use that method
@ -303,7 +313,7 @@ func (ld *Loader) open() error {
ld.data = bytes.NewReader(data)
ld.size = len(data)
ld.peep = peepData(data)
ld.preload = preloadLimit(data)
case "file":
fallthrough
@ -314,8 +324,7 @@ func (ld *Loader) open() error {
return fmt.Errorf("loader: %w", err)
}
// peep at data
ld.peep, err = io.ReadAll(io.LimitReader(r, maxPeepLength))
ld.preload, err = io.ReadAll(io.LimitReader(r, maxPreloadLength))
if err != nil {
return fmt.Errorf("loader: %w", err)
}
@ -328,8 +337,8 @@ func (ld *Loader) open() error {
}
// generate hashes
ld.HashSHA1 = fmt.Sprintf("%x", sha1.Sum(ld.peep))
ld.HashMD5 = fmt.Sprintf("%x", md5.Sum(ld.peep))
ld.HashSHA1 = fmt.Sprintf("%x", sha1.Sum(ld.preload))
ld.HashMD5 = fmt.Sprintf("%x", md5.Sum(ld.preload))
return nil
}

View file

@ -75,7 +75,7 @@ func (cart *Cartridge) fingerprintPlusROM(loader cartridgeloader.Loader) bool {
b := make([]byte, 3)
loader.Seek(0, io.SeekStart)
for {
for i := 0; i < cartridgeloader.FingerprintLimit-len(b); i++ {
n, err := loader.Read(b)
if n < len(b) {
break
@ -148,7 +148,7 @@ func fingerprintMnetwork(loader cartridgeloader.Loader) bool {
b := make([]byte, 3)
loader.Seek(0, io.SeekStart)
for {
for i := 0; i < cartridgeloader.FingerprintLimit-len(b); i++ {
n, err := loader.Read(b)
if n < len(b) {
break
@ -277,7 +277,7 @@ func fingerprintCDF(loader cartridgeloader.Loader) (bool, string) {
b := make([]byte, 4)
loader.Seek(0, io.SeekStart)
for {
for i := 0; i < cartridgeloader.FingerprintLimit-len(b); i++ {
n, err := loader.Read(b)
if n < len(b) {
break

View file

@ -75,9 +75,7 @@ func Fingerprint(port plugging.PortID, loader cartridgeloader.Loader) ports.NewP
func matchPattern(patterns [][]byte, loader cartridgeloader.Loader) bool {
for _, p := range patterns {
// limit check to the first 64k of data. any data beyond that is likely
// to be non-program data and only likely to return false-positives
if loader.ContainsLimit(65536, p) {
if loader.ContainsLimit(cartridgeloader.FingerprintLimit, p) {
return true
}
}