2020-11-09 05:23:58 +01:00
|
|
|
/* SPDX-License-Identifier: LGPL-2.1-or-later */
|
2012-10-22 16:27:00 +02:00
|
|
|
|
2015-11-30 21:43:37 +01:00
|
|
|
#include <errno.h>
|
2012-10-22 16:27:00 +02:00
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
2015-10-27 03:01:06 +01:00
|
|
|
#include "alloc-util.h"
|
2019-03-13 12:14:47 +01:00
|
|
|
#include "sort-util.h"
|
2012-10-22 16:27:00 +02:00
|
|
|
#include "strbuf.h"
|
|
|
|
|
2012-10-22 16:54:14 +02:00
|
|
|
/*
 * Strbuf stores given strings in a single contiguous allocated memory
 * area. Identical strings are de-duplicated and return the same offset
 * as the first string stored. If a string is the tail of a string that
 * already exists in the buffer, the offset of that tail is returned.
 *
 * A trie (http://en.wikipedia.org/wiki/Trie) is used to maintain the
 * information about the stored strings.
 *
 * Example of udev rules:
 *   $ ./udevadm test .
 *   ...
 *   read rules file: /usr/lib/udev/rules.d/99-systemd.rules
 *   rules contain 196608 bytes tokens (16384 * 12 bytes), 39742 bytes strings
 *   23939 strings (207859 bytes), 20404 de-duplicated (171653 bytes), 3536 trie nodes used
 *   ...
 */
|
|
|
|
|
2012-10-22 16:27:00 +02:00
|
|
|
struct strbuf *strbuf_new(void) {
|
|
|
|
struct strbuf *str;
|
|
|
|
|
2018-03-26 10:26:39 +02:00
|
|
|
str = new(struct strbuf, 1);
|
2012-10-22 16:27:00 +02:00
|
|
|
if (!str)
|
|
|
|
return NULL;
|
2018-03-26 10:26:39 +02:00
|
|
|
*str = (struct strbuf) {
|
|
|
|
.buf = new0(char, 1),
|
|
|
|
.root = new0(struct strbuf_node, 1),
|
|
|
|
.len = 1,
|
|
|
|
.nodes_count = 1,
|
|
|
|
};
|
|
|
|
if (!str->buf || !str->root) {
|
|
|
|
free(str->buf);
|
|
|
|
free(str->root);
|
|
|
|
return mfree(str);
|
|
|
|
}
|
2012-10-22 16:27:00 +02:00
|
|
|
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
2018-03-26 10:26:39 +02:00
|
|
|
static struct strbuf_node* strbuf_node_cleanup(struct strbuf_node *node) {
|
2012-10-22 16:27:00 +02:00
|
|
|
size_t i;
|
|
|
|
|
|
|
|
for (i = 0; i < node->children_count; i++)
|
|
|
|
strbuf_node_cleanup(node->children[i].child);
|
|
|
|
free(node->children);
|
2018-03-26 10:26:39 +02:00
|
|
|
return mfree(node);
|
2012-10-22 16:27:00 +02:00
|
|
|
}
|
|
|
|
|
2012-10-22 16:54:14 +02:00
|
|
|
/* clean up trie data, leave only the string buffer */
|
2012-10-22 16:27:00 +02:00
|
|
|
void strbuf_complete(struct strbuf *str) {
|
|
|
|
if (!str)
|
|
|
|
return;
|
|
|
|
if (str->root)
|
2018-03-26 10:26:39 +02:00
|
|
|
str->root = strbuf_node_cleanup(str->root);
|
2012-10-22 16:27:00 +02:00
|
|
|
}
|
|
|
|
|
2012-10-22 16:54:14 +02:00
|
|
|
/* clean up everything */
|
2012-10-22 16:27:00 +02:00
|
|
|
void strbuf_cleanup(struct strbuf *str) {
|
2018-08-22 14:50:50 +02:00
|
|
|
if (!str)
|
|
|
|
return;
|
|
|
|
|
2018-03-26 10:26:39 +02:00
|
|
|
strbuf_complete(str);
|
2012-10-22 16:27:00 +02:00
|
|
|
free(str->buf);
|
|
|
|
free(str);
|
|
|
|
}
|
|
|
|
|
2013-03-31 04:12:56 +02:00
|
|
|
static int strbuf_children_cmp(const struct strbuf_child_entry *n1,
|
|
|
|
const struct strbuf_child_entry *n2) {
|
2012-10-22 16:27:00 +02:00
|
|
|
return n1->c - n2->c;
|
|
|
|
}
|
|
|
|
|
2013-03-31 04:12:56 +02:00
|
|
|
static void bubbleinsert(struct strbuf_node *node,
|
|
|
|
uint8_t c,
|
|
|
|
struct strbuf_node *node_child) {
|
|
|
|
|
|
|
|
struct strbuf_child_entry new = {
|
|
|
|
.c = c,
|
|
|
|
.child = node_child,
|
|
|
|
};
|
|
|
|
int left = 0, right = node->children_count;
|
|
|
|
|
|
|
|
while (right > left) {
|
|
|
|
int middle = (right + left) / 2 ;
|
|
|
|
if (strbuf_children_cmp(&node->children[middle], &new) <= 0)
|
|
|
|
left = middle + 1;
|
|
|
|
else
|
|
|
|
right = middle;
|
|
|
|
}
|
|
|
|
|
|
|
|
memmove(node->children + left + 1, node->children + left,
|
|
|
|
sizeof(struct strbuf_child_entry) * (node->children_count - left));
|
|
|
|
node->children[left] = new;
|
|
|
|
|
2016-02-23 05:32:04 +01:00
|
|
|
node->children_count++;
|
2013-03-31 04:12:56 +02:00
|
|
|
}
|
|
|
|
|
2012-10-22 16:54:14 +02:00
|
|
|
/* add string, return the index/offset into the buffer */
|
2012-10-22 16:27:00 +02:00
|
|
|
ssize_t strbuf_add_string(struct strbuf *str, const char *s, size_t len) {
|
|
|
|
uint8_t c;
|
|
|
|
struct strbuf_node *node;
|
|
|
|
size_t depth;
|
|
|
|
char *buf_new;
|
|
|
|
struct strbuf_child_entry *child;
|
|
|
|
struct strbuf_node *node_child;
|
|
|
|
ssize_t off;
|
|
|
|
|
|
|
|
if (!str->root)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* search string; start from last character to find possibly matching tails */
|
2018-03-26 10:32:42 +02:00
|
|
|
|
2012-10-22 16:27:00 +02:00
|
|
|
str->in_count++;
|
2018-03-26 10:32:42 +02:00
|
|
|
if (len == 0) {
|
|
|
|
str->dedup_count++;
|
|
|
|
return 0;
|
|
|
|
}
|
2012-10-22 16:27:00 +02:00
|
|
|
str->in_len += len;
|
|
|
|
|
|
|
|
node = str->root;
|
|
|
|
for (depth = 0; depth <= len; depth++) {
|
|
|
|
struct strbuf_child_entry search;
|
|
|
|
|
|
|
|
/* match against current node */
|
|
|
|
off = node->value_off + node->value_len - len;
|
|
|
|
if (depth == len || (node->value_len >= len && memcmp(str->buf + off, s, len) == 0)) {
|
|
|
|
str->dedup_len += len;
|
|
|
|
str->dedup_count++;
|
|
|
|
return off;
|
|
|
|
}
|
|
|
|
|
strbuf: set the proper character when creating new nodes
Commit 82501b3fc added an early break when a terminal node is found to
incorrect place -- before setting c. This caused trie to be built that
does not correspond to what it points to in buffer, causing incorrect
deduplications:
# cat /etc/udev/rules.d/99-bug.rules
ENV{FOO}=="0"
ENV{xx0}=="BAR"
ENV{BAZ}=="00"
# udevadm test
* RULE /etc/udev/rules.d/99-bug.rules:1, token: 0, count: 2, label: ''
M ENV match 'FOO' '0'(plain)
* RULE /etc/udev/rules.d/99-bug.rules:2, token: 2, count: 2, label: ''
M ENV match 'xx0' 'BAR'(plain)
* RULE /etc/udev/rules.d/99-bug.rules:3, token: 4, count: 2, label: ''
M ENV match 'BAZ' 'x0'(plain)
* END
The addition of "xx0" following "0" will cause a trie like this to be
created:
c=\0
c=0 "0"
c=0 "xx0" <-- note the c is incorrect here, causing "00" to be
c=O "FOO" deduplicated to it
c=R "BAR"
This in effect caused the usb_modeswitch rule for Huawei modems to never
match and this never be switched to serial mode from mass storage.
2016-05-03 22:15:49 +02:00
|
|
|
c = s[len - 1 - depth];
|
|
|
|
|
2012-10-22 16:27:00 +02:00
|
|
|
/* lookup child node */
|
|
|
|
search.c = c;
|
2018-09-18 04:08:23 +02:00
|
|
|
child = typesafe_bsearch(&search, node->children, node->children_count, strbuf_children_cmp);
|
2012-10-22 16:27:00 +02:00
|
|
|
if (!child)
|
|
|
|
break;
|
|
|
|
node = child->child;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* add new string */
|
|
|
|
buf_new = realloc(str->buf, str->len + len+1);
|
|
|
|
if (!buf_new)
|
|
|
|
return -ENOMEM;
|
|
|
|
str->buf = buf_new;
|
|
|
|
off = str->len;
|
|
|
|
memcpy(str->buf + off, s, len);
|
|
|
|
str->len += len;
|
|
|
|
str->buf[str->len++] = '\0';
|
|
|
|
|
|
|
|
/* new node */
|
2018-03-26 10:26:39 +02:00
|
|
|
node_child = new(struct strbuf_node, 1);
|
2012-10-22 16:27:00 +02:00
|
|
|
if (!node_child)
|
|
|
|
return -ENOMEM;
|
2018-03-26 10:26:39 +02:00
|
|
|
*node_child = (struct strbuf_node) {
|
|
|
|
.value_off = off,
|
|
|
|
.value_len = len,
|
|
|
|
};
|
2012-10-22 16:27:00 +02:00
|
|
|
|
|
|
|
/* extend array, add new entry, sort for bisection */
|
2018-02-27 19:09:22 +01:00
|
|
|
child = reallocarray(node->children, node->children_count + 1, sizeof(struct strbuf_child_entry));
|
2013-03-31 01:32:56 +01:00
|
|
|
if (!child) {
|
|
|
|
free(node_child);
|
2012-10-22 16:27:00 +02:00
|
|
|
return -ENOMEM;
|
2013-03-31 01:32:56 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
str->nodes_count++;
|
|
|
|
|
2012-10-22 16:27:00 +02:00
|
|
|
node->children = child;
|
2013-03-31 04:12:56 +02:00
|
|
|
bubbleinsert(node, c, node_child);
|
2012-10-22 16:27:00 +02:00
|
|
|
|
|
|
|
return off;
|
|
|
|
}
|