Replace black/white with allow/block. (theresa-m)
This commit is contained in:
parent
c3f3043f7a
commit
8678871f18
@ -38,15 +38,15 @@ The Algorithm
|
|||||||
|
|
||||||
The algorithm works by dividing the set of bytecodes [0..255] into three
|
The algorithm works by dividing the set of bytecodes [0..255] into three
|
||||||
categories:
|
categories:
|
||||||
- The white list of textual bytecodes:
|
- The allow list of textual bytecodes:
|
||||||
9 (TAB), 10 (LF), 13 (CR), 32 (SPACE) to 255.
|
9 (TAB), 10 (LF), 13 (CR), 32 (SPACE) to 255.
|
||||||
- The gray list of tolerated bytecodes:
|
- The gray list of tolerated bytecodes:
|
||||||
7 (BEL), 8 (BS), 11 (VT), 12 (FF), 26 (SUB), 27 (ESC).
|
7 (BEL), 8 (BS), 11 (VT), 12 (FF), 26 (SUB), 27 (ESC).
|
||||||
- The black list of undesired, non-textual bytecodes:
|
- The block list of undesired, non-textual bytecodes:
|
||||||
0 (NUL) to 6, 14 to 25, 28 to 31.
|
0 (NUL) to 6, 14 to 25, 28 to 31.
|
||||||
|
|
||||||
If a file contains at least one byte that belongs to the white list and
|
If a file contains at least one byte that belongs to the allow list and
|
||||||
no byte that belongs to the black list, then the file is categorized as
|
no byte that belongs to the block list, then the file is categorized as
|
||||||
plain text; otherwise, it is categorized as binary. (The boundary case,
|
plain text; otherwise, it is categorized as binary. (The boundary case,
|
||||||
when the file is empty, automatically falls into the latter category.)
|
when the file is empty, automatically falls into the latter category.)
|
||||||
|
|
||||||
@ -84,9 +84,9 @@ consistent results, regardless what alphabet encoding is being used.
|
|||||||
results on a text encoded, say, using ISO-8859-16 versus UTF-8.)
|
results on a text encoded, say, using ISO-8859-16 versus UTF-8.)
|
||||||
|
|
||||||
There is an extra category of plain text files that are "polluted" with
|
There is an extra category of plain text files that are "polluted" with
|
||||||
one or more black-listed codes, either by mistake or by peculiar design
|
one or more block-listed codes, either by mistake or by peculiar design
|
||||||
considerations. In such cases, a scheme that tolerates a small fraction
|
considerations. In such cases, a scheme that tolerates a small fraction
|
||||||
of black-listed codes would provide an increased recall (i.e. more true
|
of block-listed codes would provide an increased recall (i.e. more true
|
||||||
positives). This, however, incurs a reduced precision overall, since
|
positives). This, however, incurs a reduced precision overall, since
|
||||||
false positives are more likely to appear in binary files that contain
|
false positives are more likely to appear in binary files that contain
|
||||||
large chunks of textual data. Furthermore, "polluted" plain text should
|
large chunks of textual data. Furthermore, "polluted" plain text should
|
||||||
|
18
trees.c
18
trees.c
@ -1091,9 +1091,9 @@ local void compress_block(s, ltree, dtree)
|
|||||||
* Check if the data type is TEXT or BINARY, using the following algorithm:
|
* Check if the data type is TEXT or BINARY, using the following algorithm:
|
||||||
* - TEXT if the two conditions below are satisfied:
|
* - TEXT if the two conditions below are satisfied:
|
||||||
* a) There are no non-portable control characters belonging to the
|
* a) There are no non-portable control characters belonging to the
|
||||||
* "black list" (0..6, 14..25, 28..31).
|
* "block list" (0..6, 14..25, 28..31).
|
||||||
* b) There is at least one printable character belonging to the
|
* b) There is at least one printable character belonging to the
|
||||||
* "white list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255).
|
* "allow list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255).
|
||||||
* - BINARY otherwise.
|
* - BINARY otherwise.
|
||||||
* - The following partially-portable control characters form a
|
* - The following partially-portable control characters form a
|
||||||
* "gray list" that is ignored in this detection algorithm:
|
* "gray list" that is ignored in this detection algorithm:
|
||||||
@ -1103,19 +1103,19 @@ local void compress_block(s, ltree, dtree)
|
|||||||
local int detect_data_type(s)
|
local int detect_data_type(s)
|
||||||
deflate_state *s;
|
deflate_state *s;
|
||||||
{
|
{
|
||||||
/* black_mask is the bit mask of black-listed bytes
|
/* block_mask is the bit mask of block-listed bytes
|
||||||
* set bits 0..6, 14..25, and 28..31
|
* set bits 0..6, 14..25, and 28..31
|
||||||
* 0xf3ffc07f = binary 11110011111111111100000001111111
|
* 0xf3ffc07f = binary 11110011111111111100000001111111
|
||||||
*/
|
*/
|
||||||
unsigned long black_mask = 0xf3ffc07fUL;
|
unsigned long block_mask = 0xf3ffc07fUL;
|
||||||
int n;
|
int n;
|
||||||
|
|
||||||
/* Check for non-textual ("black-listed") bytes. */
|
/* Check for non-textual ("block-listed") bytes. */
|
||||||
for (n = 0; n <= 31; n++, black_mask >>= 1)
|
for (n = 0; n <= 31; n++, block_mask >>= 1)
|
||||||
if ((black_mask & 1) && (s->dyn_ltree[n].Freq != 0))
|
if ((block_mask & 1) && (s->dyn_ltree[n].Freq != 0))
|
||||||
return Z_BINARY;
|
return Z_BINARY;
|
||||||
|
|
||||||
/* Check for textual ("white-listed") bytes. */
|
/* Check for textual ("allow-listed") bytes. */
|
||||||
if (s->dyn_ltree[9].Freq != 0 || s->dyn_ltree[10].Freq != 0
|
if (s->dyn_ltree[9].Freq != 0 || s->dyn_ltree[10].Freq != 0
|
||||||
|| s->dyn_ltree[13].Freq != 0)
|
|| s->dyn_ltree[13].Freq != 0)
|
||||||
return Z_TEXT;
|
return Z_TEXT;
|
||||||
@ -1123,7 +1123,7 @@ local int detect_data_type(s)
|
|||||||
if (s->dyn_ltree[n].Freq != 0)
|
if (s->dyn_ltree[n].Freq != 0)
|
||||||
return Z_TEXT;
|
return Z_TEXT;
|
||||||
|
|
||||||
/* There are no "black-listed" or "white-listed" bytes:
|
/* There are no "block-listed" or "allow-listed" bytes:
|
||||||
* this stream either is empty or has tolerated ("gray-listed") bytes only.
|
* this stream either is empty or has tolerated ("gray-listed") bytes only.
|
||||||
*/
|
*/
|
||||||
return Z_BINARY;
|
return Z_BINARY;
|
||||||
|
Loading…
Reference in New Issue
Block a user