标签:
题意:求一个文章(长度5.1e6)里面出现了多少个指定的模式串。重复出现只记一次。而且如果两个模式串都出现的情况下,一个是另一个的子串,则该子串不算出现过。
分析:AC自动机。
由于子串不算所以加一些特殊处理:
1.在文章匹配过程中,如果出现了一个模式串我们不是把匹配数量+1,而是记录那个出现过vis[id] = true;,当然trie树种也是记录了模式串的id。
2.在匹配结束后,我们遍历所有出现过的模式串,在Trie树种找到其所有出现过的子串并将其标为未出现过vis[id] = false;
要如何查找子串呢?
只需要记录每个出现过的串所对应的Trie树中的节点位置,由该节点向上走到root。其间走过的每个节点都沿着fail指针走到root一次。这样二重循环遍历到的所有节点就对应了Trie中所有该模式串的子串。
因为在AC自动机中父节点指针就是找前缀,fail指针就是找后缀。
#include <cstdio> #include <cstring> #include <queue> #include <algorithm> #include <cctype> using namespace std; #define D(x) const int MAX_LEN = (int)(5.1e6) + 10; const int MAX_N = 2505; const int MAX_FINGER_LEN = 1105; const int MAX_CHILD_NUM = 26; const int MAX_NODE_NUM = MAX_N * MAX_FINGER_LEN; int n; char st[MAX_LEN]; char st2[MAX_LEN]; int vis[MAX_N]; bool check[MAX_NODE_NUM]; struct Trie { int next[MAX_NODE_NUM][MAX_CHILD_NUM]; int fail[MAX_NODE_NUM]; int count[MAX_NODE_NUM]; int father[MAX_NODE_NUM]; int node_cnt; int root; void init() { node_cnt = 0; root = newnode(); } int newnode() { for (int i = 0; i < 26; i++) next[node_cnt][i] = -1; count[node_cnt++] = 0; return node_cnt - 1; } int get_id(char a) { return a - ‘A‘; } void insert(char buf[], int index) { int len = strlen(buf); int now = root; for (int i = 0; i < len; i++) { int id = get_id(buf[i]); if (next[now][id] == -1) { next[now][id] = newnode(); father[next[now][id]] = now; } now = next[now][id]; } count[now] = index; } void build() { queue<int>Q; fail[root] = root; father[root] = root; for (int i = 0; i < 26; i++) if (next[root][i] == -1) next[root][i] = root; else { fail[next[root][i]] = root; Q.push(next[root][i]); } while (!Q.empty()) { int now = Q.front(); Q.pop(); for (int i = 0; i < 26; i++) if (next[now][i] == -1) next[now][i] = next[fail[now]][i]; else { fail[next[now][i]]=next[fail[now]][i]; Q.push(next[now][i]); } } } int query(char buf[]) { int now = root; int res = 0; for (int i = 0; buf[i]; i++) { now = next[now][get_id(buf[i])]; int temp = now; while (temp != root && !check[temp]) { if (count[temp] != 0) vis[count[temp]] = temp; check[temp] = true; temp = fail[temp]; } } return res; } void debug() { for(int i = 0;i < node_cnt;i++) { printf("id = %3d,fail = %3d,end = %3d,chi = [",i,fail[i],count[i]); for(int j = 0;j < 26;j++) printf("%2d",next[i][j]); printf("]\n"); } } void cal() { for (int i = 1; i <= n; i++) { if (vis[i] == 0) { continue; } int temp = vis[i]; while (temp != root) { int temp2 = temp; while (temp2 != root && !check[temp2]) { if (count[temp2] != 0 && count[temp2] != i) { vis[count[temp2]] = 0; check[temp2] = true; } temp2 = fail[temp2]; } temp = father[temp]; } } } }; Trie ac; void transform(char st[], char st2[]) { int len = 0; for (int i = 0; st[i]; i++) { if (isupper(st[i])) { st2[len++] = st[i]; continue; } i++; int temp = 0; while (isdigit(st[i])) { temp *= 10; temp += st[i] - ‘0‘; i++; } for (int j = 0; j < temp; j++) { st2[len + j] = st[i]; } len += temp; i++; } st2[len] = 0; } void input() { scanf("%d", &n); for (int i = 1; i <= n; i++) { scanf("%s", st); transform(st, st2); ac.insert(st2, i); } } int work() { memset(vis, 0, sizeof(vis)); memset(check, 0, sizeof(check)); ac.query(st2); memset(check, 0, sizeof(check)); ac.cal(); int ret = 0; for (int i = 1; i <= n; i++) { if (vis[i]) { D(printf("#%d\n", vis[i])); ret++; } } return ret; } int main() { int t; scanf("%d", &t); while (t--) { ac.init(); input(); ac.build(); scanf("%s", st); transform(st, st2); printf("%d\n", work()); } return 0; }
标签:
原文地址:http://www.cnblogs.com/rainydays/p/4331649.html