动漫制作专业的认知_市场监督管理总局官网_关键词推广优化排名品牌_seo实战密码第四版

问题：搜索结果中存在重复内容（同一篇文档被多个关键词命中时会出现多次）

解决思路：把文档 ID（doc_id）相同的结果合并为一条，并将被合并条目的权值累加到一起
加入去重功能

#pragma once
#include "index.hpp"
#include "util.hpp"
#include "log.hpp"
#include <algorithm>
#include <unordered_map>
#include <jsoncpp/json/json.h>
namespace ns_searcher{struct InvertedElemPrint{uint64_t doc_id;int weight;std::vector<std::string> words;InvertedElemPrint():doc_id(0), weight(0){}};class Searcher{private:ns_index::Index *index; //供系统进行查找的索引public:Searcher(){}~Searcher(){}public:void InitSearcher(const std::string &input){//1. 获取或者创建index对象index = ns_index::Index::GetInstance();//std::cout << "获取index单例成功..." << std::endl;LOG(NORMAL, "获取index单例成功...");//2. 根据index对象建立索引index->BuildIndex(input);//std::cout << "建立正排和倒排索引成功..." << std::endl;LOG(NORMAL, "建立正排和倒排索引成功...");}//query: 搜索关键字//json_string: 返回给用户浏览器的搜索结果void Search(const std::string &query, std::string *json_string){//1.[分词]:对我们的query进行按照searcher的要求进行分词std::vector<std::string> words;ns_util::JiebaUtil::CutString(query, &words);//2.[触发]:就是根据分词的各个"词"，进行index查找,建立index是忽略大小写，所以搜索，关键字也需要//ns_index::InvertedList inverted_list_all; //内部InvertedElemstd::vector<InvertedElemPrint> inverted_list_all;std::unordered_map<uint64_t, InvertedElemPrint> tokens_map;for(std::string word : words){boost::to_lower(word);ns_index::InvertedList *inverted_list = index->GetInvertedList(word);if(nullptr == inverted_list){continue;}//不完美的地方：暂时可以交给大家 , 你/是/一个/好人 100//inverted_list_all.insert(inverted_list_all.end(), inverted_list->begin(), inverted_list->end());for(const auto &elem : *inverted_list){auto &item = tokens_map[elem.doc_id]; //[]:如果存在直接获取，如果不存在新建//item一定是doc_id相同的print节点item.doc_id = elem.doc_id;item.weight += elem.weight;item.words.push_back(elem.word);}}for(const auto &item : tokens_map){inverted_list_all.push_back(std::move(item.second));}//3.[合并排序]：汇总查找结果，按照相关性(weight)降序排序//std::sort(inverted_list_all.begin(), inverted_list_all.end(),\//      [](const ns_index::InvertedElem &e1, const ns_index::InvertedElem &e2){//        return e1.weight > e2.weight;//        });std::sort(inverted_list_all.begin(), inverted_list_all.end(),\[](const InvertedElemPrint &e1, const InvertedElemPrint &e2){return e1.weight > e2.weight;});//4.[构建]:根据查找出来的结果，构建json串 -- jsoncpp 
--通过jsoncpp完成序列化&&反序列化Json::Value root;for(auto &item : inverted_list_all){ns_index::DocInfo * doc = index->GetForwardIndex(item.doc_id);if(nullptr == doc){continue;}Json::Value elem;elem["title"] = doc->title;elem["desc"] = GetDesc(doc->content, item.words[0]); //content是文档的去标签的结果，但是不是我们想要的，我们要的是一部分 TODOelem["url"]  = doc->url;//for deubg, for deleteelem["id"] = (int)item.doc_id;elem["weight"] = item.weight; //int->stringroot.append(elem);}//Json::StyledWriter writer;Json::FastWriter writer;*json_string = writer.write(root);}std::string GetDesc(const std::string &html_content, const std::string &word){//找到word在html_content中的首次出现，然后往前找50字节(如果没有，从begin开始)，往后找100字节(如果没有，到end就可以的)//截取出这部分内容const int prev_step = 50;const int next_step = 100;//1. 找到首次出现auto iter = std::search(html_content.begin(), html_content.end(), word.begin(), word.end(), [](int x, int y){return (std::tolower(x) == std::tolower(y));});if(iter == html_content.end()){return "None1";}int pos = std::distance(html_content.begin(), iter);//2. 获取start，end , std::size_t 无符号整数int start = 0; int end = html_content.size() - 1;//如果之前有50+字符，就更新开始位置if(pos > start + prev_step) start = pos - prev_step;if(pos < end - next_step) end = pos + next_step;//3. 截取子串,returnif(start >= end) return "None2";std::string desc = html_content.substr(start, end - start);desc += "...";return desc;}};
}

添加日志

创建一个log.hpp
![[Pasted image 20250221110724.png]]

#pragma once
#include <iostream>
#include <string>
#include <ctime>
// Log levels. LOG() stringifies the level *name* (e.g. "NORMAL"), so the
// numeric values below are not what gets printed.
#define NORMAL  1
#define WARNING 2
#define DEBUG   3
#define FATAL   4

// LOG(LEVEL, MESSAGE): prints MESSAGE tagged with the level name and the
// file/line of the call site.
#define LOG(LEVEL, MESSAGE) log(#LEVEL, MESSAGE, __FILE__, __LINE__)

// Writes one log line to std::cout in the form
//   [level][unix-time][message][file : line]
// `inline` because this function is defined in a header: without it, every
// translation unit that includes the header emits its own definition and the
// program fails to link. The strings are taken by const reference to avoid
// copying them on every call.
inline void log(const std::string &level, const std::string &message, const std::string &file, int line)
{
    std::cout << "[" << level << "]" << "[" << std::time(nullptr) << "]" << "[" << message << "]" << "[" << file << " : " << line << "]" << std::endl;
}

index.hpp

bool BuildIndex(const std::string &input) //parse处理完毕的数据交给我
{std::ifstream in(input, std::ios::in | std::ios::binary);if(!in.is_open()){std::cerr << "sorry, " << input << " open error" << std::endl;return false;}std::string line;int count = 0;while(std::getline(in, line)){DocInfo * doc = BuildForwardIndex(line);if(nullptr == doc){std::cerr << "build " << line << " error" << std::endl; //for deubgcontinue;}BuildInvertedIndex(*doc);count++;if(count % 50 == 0){//std::cout << "当前已经建立的索引文档：" << count << std::endl;LOG(NORMAL, "当前的已经建立的索引文档: " + std::to_string(count));}}return true;
}

searcher.hpp

void InitSearcher(const std::string &input)
{//1. 获取或者创建index对象index = ns_index::Index::GetInstance();//std::cout << "获取index单例成功..." << std::endl;LOG(NORMAL, "获取index单例成功...");//2. 根据index对象建立索引index->BuildIndex(input);//std::cout << "建立正排和倒排索引成功..." << std::endl;LOG(NORMAL, "建立正排和倒排索引成功...");
}

http_server.cc

#include "cpp-httplib/httplib.h"
#include "searcher.hpp"

// File the index is built from.
const std::string input = "data/raw_html/raw.txt";
// Directory served as static content.
const std::string root_path = "./wwwroot";

// Builds the search index, then serves static files from root_path and answers
// search requests on GET /s?word=<keywords> with a JSON body.
// Returns 1 when the server cannot listen on the configured address/port.
int main()
{
    ns_searcher::Searcher search;
    search.InitSearcher(input);

    httplib::Server svr;
    svr.set_base_dir(root_path.c_str());

    // `search` is captured by reference; it outlives the server because
    // listen() blocks inside main() until the server stops.
    svr.Get("/s", [&search](const httplib::Request &req, httplib::Response &rsp) {
        if (!req.has_param("word")) {
            rsp.set_content("必须要有搜索关键字!", "text/plain; charset=utf-8");
            return;
        }
        std::string word = req.get_param_value("word");
        LOG(NORMAL, "用户搜索的: " + word);

        std::string json_string;
        search.Search(word, &json_string);
        rsp.set_content(json_string, "application/json");
    });

    LOG(NORMAL, "服务器启动成功...");
    // listen() blocks while serving; a false return means the server failed to
    // bind/accept, which must not be reported as a clean exit.
    if (!svr.listen("0.0.0.0", 8081)) {
        LOG(FATAL, "服务器监听 0.0.0.0:8081 失败");
        return 1;
    }
    return 0;
}

![[Pasted image 20250221112955.png]]