暂时嘛,啥也没有
蒟蒻博主啥也不会捏
会了也不告诉你
bug
这里是大家可能会遇到的一些问题,我分享一些我的想法
读题
要注意理解题目意思。大致就是首先统计article的词频。接着得到单词中非停用词的词频最多的前N个。(要注意不是每个网页的前N个,而是将所有article的网页的词频统计好之后的前N个(这个也就是特征向量)。然后通过题目描述的计算方式得到指纹。然后对每个sample的网页,同样统计特征向量中每个单词(注意这里变成了特征向量中的单词,而不需要所有的)的词频,通过题目描述的计算方式再次得到指纹。最后将每一个sample网页的指纹与所有article的指纹进行汉明距离的计算,然后统计输出。
第一种,本地不对
本地不管是输出错误还是运行时错误,都可以采取打印调试法和逐步调试法。(debug是必修课喔~~)
第二种,本地对小数据不对
嗯,这种情况其实到后面可以向助教祈求小数据的内容。
首先可能是windows与linux的差异。
也就是linux中每一行结束时\r\n,而windows是\n。
可以将文件打开Mode改为rb,看本地输出是否正确。
第三种,小数据对大数据不对
这一种是最恼火的
如果是输出错误,
- 千万要注意article和sample的网页名。如果是直接以为就是样例中的1-%d这种的,那么就会出问题。建议改为从数组中解析或者fscanf。
如果是运行时错误,
多半是数组开小了,或者malloc多了。
- 这种情况下关键是找到哪里运行时错误了。可以在代码的不同位置return 0强制结束然。后看在哪个之前显示的是输出错误,过了那个就变成了运行时错误,那么就仔细看看应该如何解决。
题目
code
// N 5000
// M 64
#include
#include
#pragma GCC optimize("Ofast")
#pragma once
#pragma pack (16)
#define low(c) ((c>=65&&c<=90 )? 32+c:c)
//#define low(c) (c|0x20)
#define NMAX 10005
#define MMAX 130
#define MAXHASH 1280010
char bufferHash[MAXHASH];
#define MAXSTOPCHARS 10000
char bufferStopChars[MAXSTOPCHARS];
#define MAXARTICLECHARS 350000000
char bufferArticleChars[MAXARTICLECHARS];
#define PAGENUMMAX 16000
int hash_better[MMAX][NMAX];
int N, M;
int myatoi(char s[]) {
int i = 0;
int tmp = 0;
while (s[i] != '\0') {
if (s[i] >= 48 && s[i] <= 57) {
tmp = tmp * 10 + s[i] - '0';
i++;
} else {
return 0;
}
}
return tmp;
}
int mystrcmp(char *s1, char *s2) {
char *p1 = s1;
char *p2 = s2;
while (*p1 && *p2) {
if (*p1 != *p2) {
return *p1 - *p2;
}
p1++;
p2++;
}
if (*p1) {
return 1;
} else if (*p2) {
return -1;
} else {
return 0;
}
}
void mystrcpy(char *s1, const char *s2) {
int i;
for (i = 0; s2[i]; ++i) {
s1[i] = s2[i];
}
s1[i] = '\0';
}
void mymemset_int(int *s, int n, int size) {
for (int i = 0; i < size; ++i) {
s[i] = n;
}
}
void mystrcat(char *s1, char *s2) {
char *p1 = s1;
char *p2 = s2;
while (*p1) {
p1++;
}
while (*p2) {
*p1 = *p2;
p1++;
p2++;
}
*p1 = '\0';
}
int mystrlen(char *s) {
char *p = s;
int len = 0;
while (*p) {
p++;
len++;
}
return len;
}
void readHash(FILE *in) {
int len = fread(bufferHash, sizeof(char), MAXHASH, in);
int pos = 0;
int hash_i_line = 0;
int hash_i = 0;
int flag = 1;
while (pos < len) {
int n = bufferHash[pos++];
if (n != 10) {
hash_better[hash_i_line][hash_i] = n == 48 ? -1 : 1;
hash_i_line++;
if (!flag)
flag = 1;
} else {
if (flag) {
flag = 0;
hash_i++;
hash_i_line = 0;
}
}
}
}
int trie_stopWords[1010][26] = {0};
int num_stopWords[1010] = {0};
int pos_stopWords = 0;
int search_stopwords(char *str) {
int p = 0;
for (int i = 0; str[i]; ++i) {
int n = str[i] - 'a';
if (trie_stopWords[p][n] == 0) return 0;
p = trie_stopWords[p][n];
}
return num_stopWords[p]; //就是1
}
void createStopwordsTrie(FILE *in) {
int len = fread(bufferStopChars, sizeof(char), MAXSTOPCHARS, in);
int pos = 0;
int p = 0;
while (pos < len) {
int n = bufferStopChars[pos] - 'a';
pos++;
if (n >= 0 && n <= 25) {
if (trie_stopWords[p][n] == 0) {
trie_stopWords[p][n] = ++pos_stopWords;
}
p = trie_stopWords[p][n];
} else {
num_stopWords[p] = 1;
p = 0;
}
}
}
int trie_article[1500010][26] = {0};
int num_article[1500010] = {0};
typedef struct aword {
char str[80];
int p;
} word;
word word_num_article[106010] = {0};
int pos_word = 0;
int pos_article = 0;
char name_page[PAGENUMMAX][20];
int num_page = 0;
int len_article = 0;
void createArticleTrie(FILE *in) {
len_article = fread(bufferArticleChars, sizeof(char), MAXARTICLECHARS, in);
int p = 0;
int pos = 0;
int i = 0;
int n = bufferArticleChars[pos++];
char temp[80];
int temp_i = 0;
while (n >= 32 && n <= 126) {
name_page[num_page][i++] = n;
n = low(bufferArticleChars[pos]);
pos++;
}
name_page[num_page][i] = '\0';
while (pos ^ len_article) {
n = low(bufferArticleChars[pos]);
pos++;
if (n >= 97 && n <= 122) {
temp[temp_i++] = n;
if (trie_article[p][n - 97] == 0) {
trie_article[p][n - 97] = ++pos_article;
}
p = trie_article[p][n - 97];
} else {
temp[temp_i++] = '\0';
if (search_stopwords(temp) == 0 && num_article[p] == 0) {
mystrcpy(word_num_article[pos_word].str, temp);
word_num_article[pos_word].p = p;
pos_word++;
}
if (n == 12) {
num_page++;
i = 0;
while (!(n >= 32 && n <= 126)) {
pos++;
n = low(bufferArticleChars[pos]);
}
while (n >= 32 && n <= 126) {
name_page[num_page][i++] = n;
pos++;
n = low(bufferArticleChars[pos]);
}
name_page[num_page][i] = '\0';
}
num_article[p] += 1;
temp_i = 0;
p = 0;
}
}
}
int cmp(const void *a, const void *b) {
word *p = (word *) a;
word *q = (word *) b;
if (num_article[p->p] != num_article[q->p]) return num_article[q->p] - num_article[p->p];
return mystrcmp(p->str, q->str);
}
int trie_N_words[101000][26] = {0};
int num_N_words[101000] = {0};
int search_N_words(char *str) {
if (str[0] == '\0') return 0;
int p = 0;
for (int i = 0; str[i]; ++i) {
int n = str[i] - 'a';
if (trie_N_words[p][n] == 0) {
return 0;
}
p = trie_N_words[p][n];
}
return num_N_words[p];
}
void createNWordsTrie() {
int p = 0;
int pos = 0;
int n;
for (int i = 0; i ^ N; ++i) {
for (int j = 0; word_num_article[i].str[j]; ++j) {
n = word_num_article[i].str[j] - 'a';
if (trie_N_words[p][n] == 0) {
trie_N_words[p][n] = ++pos;
}
p = trie_N_words[p][n];
}
num_N_words[p] = i + 1;
p = 0;
}
}
int weightArticle[NMAX] = {0};
int sumSignedWeightArticle[MMAX] = {0};
__uint128_t fingerprintArticle[PAGENUMMAX] = {0};
void reReadArticle() {
int pos = 0;
int i = 0;
int n = 0;
char temp[80];
int temp_i = 0;
while (pos ^ len_article) {
n = low(bufferArticleChars[pos]);
pos++;
if (n >= 97 && n <= 122) {
temp[temp_i++] = n;
} else {
temp[temp_i] = '\0';
int isNWords = search_N_words(temp);
if (isNWords != 0) {
weightArticle[isNWords - 1]++;
}
if (n == 12) {
//0.96429S
for (int j = 0; j < N; ++j) {
if(!weightArticle[j]) continue;
for (int l = 0; l < M; ++l) {
sumSignedWeightArticle[l]+= weightArticle[j] * hash_better[l][j];
}
}
for (int l = 0; l ^ M; ++l) {
if (sumSignedWeightArticle[l] > 0) {
fingerprintArticle[i] = (fingerprintArticle[i] << 1) | 1;
} else {
fingerprintArticle[i] = (fingerprintArticle[i] << 1) | 0;
}
}
mymemset_int(weightArticle, 0, N);
mymemset_int(sumSignedWeightArticle, 0,M);
i++;
}
temp_i = 0;
}
}
}
int hammingDistance(__uint128_t a, __uint128_t b) {
__uint128_t c = a ^ b;
int dist = 0;
while (c != 0) {
dist += 1;
c = c & (c - 1);
}
return dist;
}
int weightSample[NMAX] = {0};
int sumSignWeightSample[MMAX] = {0};
__uint128_t fingerprintSample = 0;
char name_sample[20];
char ans[PAGENUMMAX][4][10005];
int num_sample = 0;
int flag_ans[PAGENUMMAX][4] = {0};
int distance = 0;
char *space = " ";
void readSample(FILE *in, FILE *out) {
int flag = 1;
int len = fread(bufferArticleChars, sizeof(char), MAXARTICLECHARS, in);
int pos = 0;
int i = 0;
int n = 0;
char temp[80];
int temp_i = 0;
int flag_f = 0;
while (pos ^ len) {
n = low(bufferArticleChars[pos]);
if (pos == 0 || flag_f) {
while (!(n >= 48 && n <= 122)) {
n = bufferArticleChars[pos++];
}
while (n != 10 && n != 13) {
name_sample[i] = n;
i++;
n = bufferArticleChars[pos++];
}
name_sample[i] = '\0';
i = 0;
flag_f = 0;
}
pos++;
if (n >= 97 && n <= 122) {
while (n >= 97 && n <= 122) {
temp[temp_i++] = n;
n = low(bufferArticleChars[pos]);
pos++;
}
temp[temp_i++] = '\0';
temp_i = 0;
} else {
temp[temp_i] = '\0';
temp_i = 0;
}
int ppage = search_N_words(temp);
if (ppage != 0) {
weightSample[ppage - 1]++;
}
if (n == 12) {
flag_f = 1;
fingerprintSample = 0;
for (int j = 0; j ^ N;++j){
if(!weightSample[j]) continue;
for (int l = 0; l ^ M; ++l) {
sumSignWeightSample[l]+=weightSample[j] * hash_better[l][j];
}
}
for (int l = 0; l ^ M; ++l) {
if (sumSignWeightSample[l] > 0) {
fingerprintSample = (fingerprintSample << 1) | 1;
} else {
fingerprintSample = (fingerprintSample << 1) | 0;
}
}
mymemset_int(weightSample, 0, N);
mymemset_int(sumSignWeightSample, 0, M);
for (int j = 0; j ^ (num_page + 1); ++j) {
distance = hammingDistance(fingerprintSample, fingerprintArticle[j]);
if (distance <= 3) {
mystrcat(ans[num_sample][distance], name_page[j]);
mystrcat(ans[num_sample][distance], space);
flag_ans[num_sample][distance] = 1;
}
}
fprintf(out, "%s", name_sample);
for (int j = 0; j ^ 4; ++j) {
if (!flag_ans[num_sample][j]) continue;
fprintf(out, "\n%d:", j);
fwrite(ans[num_sample][j], sizeof(char), mystrlen(ans[num_sample][j]), out);
}
fprintf(out, "\n");
num_sample++;
}
}
}
int main(int argc, char *argv[]) {
N = myatoi(argv[1]);
M = myatoi(argv[2]);
FILE *pp_result = fopen("result.txt", "w");
FILE *fp_stop = fopen("stopwords.txt", "rb");
FILE *fp_article = fopen("article.txt", "rb");
FILE *fp_sample = fopen("sample.txt", "rb");
FILE *fp_hashvalue = fopen("hashvalue.txt", "rb");
readHash(fp_hashvalue);
fclose(fp_hashvalue);
createStopwordsTrie(fp_stop);
fclose(fp_stop);
//0.01427S
createArticleTrie(fp_article);
fclose(fp_article);
//0.39820S
qsort(word_num_article, pos_word - 1, sizeof(word_num_article[0]), cmp);
//0.49212S
createNWordsTrie();
//0.50967
reReadArticle();
//3.53592S
readSample(fp_sample, pp_result);
// fclose(fp_sample);
// fclose(pp_result);
//3.63961S
return 0;
}
version5说明
这里我优化了计算指纹的部分。我将计算的循环次序进行了交换,同时我判断weight是否为0,如果是就不需要进行循环了。
这个版本在优化后最终也是跑到了1.15s