大一下数据结构大作业


暂时嘛,啥也没有

蒟蒻博主啥也不会捏

会了也不告诉你

bug

这里是大家可能会遇到的一些问题,我分享一些我的想法

读题

要注意理解题目意思。大致就是首先统计article的词频。接着得到单词中非停用词的词频最多的前N个。(要注意不是每个网页的前N个,而是将所有article的网页的词频统计好之后的前N个(这个也就是特征向量)。然后通过题目描述的计算方式得到指纹。然后对每个sample的网页,同样统计特征向量中每个单词(注意这里变成了特征向量中的单词,而不需要所有的)的词频,通过题目描述的计算方式再次得到指纹。最后将每一个sample网页的指纹与所有article的指纹进行汉明距离的计算,然后统计输出。

第一种,本地不对

本地不管是输出错误还是运行时错误,都可以采取打印调试法和逐步调试法。(debug是必修课喔~~)

第二种,本地对小数据不对

嗯,这种情况其实到后面可以向助教祈求小数据的内容。

首先可能是windows与linux的差异。

也就是linux中每一行结束时\r\n,而windows是\n。

可以将文件打开Mode改为rb,看本地输出是否正确。

第三种,小数据对大数据不对

这一种是最恼火的

如果是输出错误,

  • 千万要注意article和sample的网页名。如果是直接以为就是样例中的1-%d这种的,那么就会出问题。建议改为从数组中解析或者fscanf。

如果是运行时错误,

多半是数组开小了,或者malloc多了。

  • 这种情况下关键是找到哪里运行时错误了。可以在代码的不同位置return 0强制结束然。后看在哪个之前显示的是输出错误,过了那个就变成了运行时错误,那么就仔细看看应该如何解决。

题目

code

// N 5000
// M 64
#include 
#include 
#pragma GCC optimize("Ofast")
#pragma once
#pragma pack (16)
#define low(c) ((c>=65&&c<=90 )? 32+c:c)
//#define low(c) (c|0x20)
#define NMAX 10005
#define MMAX 130
#define MAXHASH 1280010
char bufferHash[MAXHASH];
#define MAXSTOPCHARS 10000
char bufferStopChars[MAXSTOPCHARS];
#define MAXARTICLECHARS 350000000
char bufferArticleChars[MAXARTICLECHARS];
#define PAGENUMMAX 16000
int hash_better[MMAX][NMAX];
int N, M;
int myatoi(char s[]) {
    int i = 0;
    int tmp = 0;
    while (s[i] != '\0') {
        if (s[i] >= 48 && s[i] <= 57) {
            tmp = tmp * 10 + s[i] - '0';
            i++;
        } else {
            return 0;
        }
    }
    return tmp;
}
int mystrcmp(char *s1, char *s2) {
    char *p1 = s1;
    char *p2 = s2;
    while (*p1 && *p2) {
        if (*p1 != *p2) {
            return *p1 - *p2;
        }
        p1++;
        p2++;
    }
    if (*p1) {
        return 1;
    } else if (*p2) {
        return -1;
    } else {
        return 0;
    }
}
void mystrcpy(char *s1, const char *s2) {
    int i;
    for (i = 0; s2[i]; ++i) {
        s1[i] = s2[i];
    }
    s1[i] = '\0';
}
void mymemset_int(int *s, int n, int size) {
    for (int i = 0; i < size; ++i) {
        s[i] = n;
    }
}
void mystrcat(char *s1, char *s2) {
    char *p1 = s1;
    char *p2 = s2;
    while (*p1) {
        p1++;
    }
    while (*p2) {
        *p1 = *p2;
        p1++;
        p2++;
    }
    *p1 = '\0';
}
int mystrlen(char *s) {
    char *p = s;
    int len = 0;
    while (*p) {
        p++;
        len++;
    }
    return len;
}
void readHash(FILE *in) {
    int len = fread(bufferHash, sizeof(char), MAXHASH, in);
    int pos = 0;
    int hash_i_line = 0;
    int hash_i = 0;
    int flag = 1;
    while (pos < len) {
        int n = bufferHash[pos++];
        if (n != 10) {
            hash_better[hash_i_line][hash_i] = n == 48 ? -1 : 1;
            hash_i_line++;
            if (!flag)
                flag = 1;
        } else {
            if (flag) {
                flag = 0;
                hash_i++;
                hash_i_line = 0;
            }
        }
    }
}
int trie_stopWords[1010][26] = {0};
int num_stopWords[1010] = {0};
int pos_stopWords = 0;
int search_stopwords(char *str) {
    int p = 0;
    for (int i = 0; str[i]; ++i) {
        int n = str[i] - 'a';
        if (trie_stopWords[p][n] == 0) return 0;
        p = trie_stopWords[p][n];
    }
    return num_stopWords[p];       //就是1
}
void createStopwordsTrie(FILE *in) {
    int len = fread(bufferStopChars, sizeof(char), MAXSTOPCHARS, in);
    int pos = 0;
    int p = 0;
    while (pos < len) {
        int n = bufferStopChars[pos] - 'a';
        pos++;
        if (n >= 0 && n <= 25) {
            if (trie_stopWords[p][n] == 0) {
                trie_stopWords[p][n] = ++pos_stopWords;
            }
            p = trie_stopWords[p][n];
        } else {
            num_stopWords[p] = 1;
            p = 0;
        }
    }
}
int trie_article[1500010][26] = {0};
int num_article[1500010] = {0};
typedef struct aword {
    char str[80];
    int p;
} word;
word word_num_article[106010] = {0};
int pos_word = 0;
int pos_article = 0;
char name_page[PAGENUMMAX][20];
int num_page = 0;
int len_article = 0;
void createArticleTrie(FILE *in) {
    len_article = fread(bufferArticleChars, sizeof(char), MAXARTICLECHARS, in);
    int p = 0;
    int pos = 0;
    int i = 0;
    int n = bufferArticleChars[pos++];
    char temp[80];
    int temp_i = 0;
    while (n >= 32 && n <= 126) {
        name_page[num_page][i++] = n;
        n = low(bufferArticleChars[pos]);
        pos++;
    }
    name_page[num_page][i] = '\0';
    while (pos ^ len_article) {
        n = low(bufferArticleChars[pos]);
        pos++;
        if (n >= 97 && n <= 122) {
            temp[temp_i++] = n;
            if (trie_article[p][n - 97] == 0) {
                trie_article[p][n - 97] = ++pos_article;
            }
            p = trie_article[p][n - 97];
        } else {
            temp[temp_i++] = '\0';
            if (search_stopwords(temp) == 0 && num_article[p] == 0) {
                mystrcpy(word_num_article[pos_word].str, temp);
                word_num_article[pos_word].p = p;
                pos_word++;
            }
            if (n == 12) {
                num_page++;
                i = 0;
                while (!(n >= 32 && n <= 126)) {
                    pos++;
                    n = low(bufferArticleChars[pos]);
                }
                while (n >= 32 && n <= 126) {
                    name_page[num_page][i++] = n;
                    pos++;
                    n = low(bufferArticleChars[pos]);
                }
                name_page[num_page][i] = '\0';
            }
            num_article[p] += 1;
            temp_i = 0;
            p = 0;
        }
    }
}
int cmp(const void *a, const void *b) {
    word *p = (word *) a;
    word *q = (word *) b;
    if (num_article[p->p] != num_article[q->p]) return num_article[q->p] - num_article[p->p];
    return mystrcmp(p->str, q->str);
}
int trie_N_words[101000][26] = {0};
int num_N_words[101000] = {0};
int search_N_words(char *str) {
    if (str[0] == '\0') return 0;
    int p = 0;
    for (int i = 0; str[i]; ++i) {
        int n = str[i] - 'a';
        if (trie_N_words[p][n] == 0) {
            return 0;
        }
        p = trie_N_words[p][n];
    }
    return num_N_words[p];
}
void createNWordsTrie() {
    int p = 0;
    int pos = 0;
    int n;
    for (int i = 0; i ^ N; ++i) {
        for (int j = 0; word_num_article[i].str[j]; ++j) {
            n = word_num_article[i].str[j] - 'a';
            if (trie_N_words[p][n] == 0) {
                trie_N_words[p][n] = ++pos;
            }
            p = trie_N_words[p][n];
        }
        num_N_words[p] = i + 1;
        p = 0;
    }
}
int weightArticle[NMAX] = {0};
int sumSignedWeightArticle[MMAX] = {0};
__uint128_t fingerprintArticle[PAGENUMMAX] = {0};
void reReadArticle() {
    int pos = 0;
    int i = 0;
    int n = 0;
    char temp[80];
    int temp_i = 0;
    while (pos ^ len_article) {
        n = low(bufferArticleChars[pos]);
        pos++;
        if (n >= 97 && n <= 122) {
            temp[temp_i++] = n;
        } else {
            temp[temp_i] = '\0';
            int isNWords = search_N_words(temp);
            if (isNWords != 0) {
                weightArticle[isNWords - 1]++;
            }
            if (n == 12) {
                //0.96429S
                for (int j = 0; j < N; ++j) {
                    if(!weightArticle[j]) continue;
                    for (int l = 0; l < M; ++l) {
                        sumSignedWeightArticle[l]+= weightArticle[j] * hash_better[l][j];
                    }
                }
                for (int l = 0; l ^ M; ++l) {
                    if (sumSignedWeightArticle[l] > 0) {
                        fingerprintArticle[i] = (fingerprintArticle[i] << 1) | 1;
                    } else {
                        fingerprintArticle[i] = (fingerprintArticle[i] << 1) | 0;
                    }
                }
                mymemset_int(weightArticle, 0, N);
                mymemset_int(sumSignedWeightArticle, 0,M);
                i++;
            }
            temp_i = 0;
        }
    }
}
int hammingDistance(__uint128_t a, __uint128_t b) {
    __uint128_t c = a ^ b;
    int dist = 0;
    while (c != 0) {
        dist += 1;
        c = c & (c - 1);
    }
    return dist;
}
int weightSample[NMAX] = {0};
int sumSignWeightSample[MMAX] = {0};
__uint128_t fingerprintSample = 0;
char name_sample[20];
char ans[PAGENUMMAX][4][10005];
int num_sample = 0;
int flag_ans[PAGENUMMAX][4] = {0};
int distance = 0;
char *space = " ";
void readSample(FILE *in, FILE *out) {
    int flag = 1;
    int len = fread(bufferArticleChars, sizeof(char), MAXARTICLECHARS, in);
    int pos = 0;
    int i = 0;
    int n = 0;
    char temp[80];
    int temp_i = 0;
    int flag_f = 0;
    while (pos ^ len) {
        n = low(bufferArticleChars[pos]);
        if (pos == 0 || flag_f) {
            while (!(n >= 48 && n <= 122)) {
                n = bufferArticleChars[pos++];
            }
            while (n != 10 && n != 13) {
                name_sample[i] = n;
                i++;
                n = bufferArticleChars[pos++];
            }
            name_sample[i] = '\0';
            i = 0;
            flag_f = 0;
        }
        pos++;
        if (n >= 97 && n <= 122) {
            while (n >= 97 && n <= 122) {
                temp[temp_i++] = n;
                n = low(bufferArticleChars[pos]);
                pos++;
            }
            temp[temp_i++] = '\0';
            temp_i = 0;
        } else {
            temp[temp_i] = '\0';
            temp_i = 0;
        }
        int ppage = search_N_words(temp);
        if (ppage != 0) {
            weightSample[ppage - 1]++;
        }
        if (n == 12) {
            flag_f = 1;
            fingerprintSample = 0;
            for (int j = 0; j ^ N;++j){
                if(!weightSample[j]) continue;
                for (int l = 0; l ^ M; ++l) {
                    sumSignWeightSample[l]+=weightSample[j] * hash_better[l][j];
                }
            }
            for (int l = 0; l ^ M; ++l) {
                if (sumSignWeightSample[l] > 0) {
                    fingerprintSample = (fingerprintSample << 1) | 1;
                } else {
                    fingerprintSample = (fingerprintSample << 1) | 0;
                }
            }
            mymemset_int(weightSample, 0, N);
            mymemset_int(sumSignWeightSample, 0, M);
            for (int j = 0; j ^ (num_page + 1); ++j) {
                distance = hammingDistance(fingerprintSample, fingerprintArticle[j]);
                if (distance <= 3) {
                    mystrcat(ans[num_sample][distance], name_page[j]);
                    mystrcat(ans[num_sample][distance], space);
                    flag_ans[num_sample][distance] = 1;
                }
            }
            fprintf(out, "%s", name_sample);
            for (int j = 0; j ^ 4; ++j) {
                if (!flag_ans[num_sample][j]) continue;
                fprintf(out, "\n%d:", j);
                fwrite(ans[num_sample][j], sizeof(char), mystrlen(ans[num_sample][j]), out);
            }
            fprintf(out, "\n");
            num_sample++;
        }
    }
}
int main(int argc, char *argv[]) {
    N = myatoi(argv[1]);
    M = myatoi(argv[2]);
    FILE *pp_result = fopen("result.txt", "w");
    FILE *fp_stop = fopen("stopwords.txt", "rb");
    FILE *fp_article = fopen("article.txt", "rb");
    FILE *fp_sample = fopen("sample.txt", "rb");
    FILE *fp_hashvalue = fopen("hashvalue.txt", "rb");
    readHash(fp_hashvalue);
    fclose(fp_hashvalue);
    createStopwordsTrie(fp_stop);
    fclose(fp_stop);
    //0.01427S
    createArticleTrie(fp_article);
    fclose(fp_article);
    //0.39820S
    qsort(word_num_article, pos_word - 1, sizeof(word_num_article[0]), cmp);
    //0.49212S
    createNWordsTrie();
    //0.50967
    reReadArticle();
    //3.53592S
    readSample(fp_sample, pp_result);
//    fclose(fp_sample);
//    fclose(pp_result);
    //3.63961S
    return 0;
}

version5说明

这里我优化了计算指纹的部分。我将计算的循环次序进行了交换,同时我判断weight是否为0,如果是就不需要进行循环了。

这个版本在优化后最终也是跑到了1.15s


文章作者: hugo
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 hugo !
  目录