PandaWiki/backend/usecase/crawler.go

package usecase

import (
    "context"
    "crypto/tls"
    "fmt"
    "net/http"
    "slices"

    "github.com/google/uuid"

    v1 "github.com/chaitin/panda-wiki/api/crawler/v1"
    "github.com/chaitin/panda-wiki/consts"
    "github.com/chaitin/panda-wiki/log"
    "github.com/chaitin/panda-wiki/mq"
    "github.com/chaitin/panda-wiki/pkg/anydoc"
    "github.com/chaitin/panda-wiki/store/cache"
    "github.com/chaitin/panda-wiki/utils"
)
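
// CrawlerUsecase exposes external content sources (URLs, uploaded files,
// Feishu, Confluence, Notion, ...) through the anydoc service as
// list/export/poll operations.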
type CrawlerUsecase struct {
    logger       *log.Logger
    anydocClient *anydoc.Client
    httpClient   *http.Client
    cache        *cache.Cache
}
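
// NewCrawlerUsecase builds a CrawlerUsecase whose anydoc client is bound to
// the given MQ consumer.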
func NewCrawlerUsecase(logger *log.Logger, mqConsumer mq.MQConsumer, cache *cache.Cache) (*CrawlerUsecase, error) {
    anydocClient, err := anydoc.NewClient(logger, mqConsumer)
    if err != nil {
        return nil, err
    }
    return &CrawlerUsecase{
        logger:       logger,
        anydocClient: anydocClient,
        cache:        cache,
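        // Crawl targets often sit behind self-signed or internal
        // certificates, so TLS verification is skipped deliberately.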
        httpClient: &http.Client{
            Transport: &http.Transport{
                TLSClientConfig: &tls.Config{
                    InsecureSkipVerify: true,
                },
            },
        },
    }, nil
}
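
// ParseUrl lists the documents reachable from the requested crawler source.
// The anydoc session id is reused from req.Key when its base name is already
// a UUID; otherwise a fresh UUID is generated.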
func (u *CrawlerUsecase) ParseUrl(ctx context.Context, req *v1.CrawlerParseReq) (*v1.CrawlerParseResp, error) {
    id := utils.GetFileNameWithoutExt(req.Key)
    if !utils.IsUUID(id) {
        id = uuid.New().String()
    }
    // File sources go through the upload endpoint first, so rewrite the key
    // into the static-file URL served by MinIO.
    if req.CrawlerSource.Type() == consts.CrawlerSourceTypeFile {
        req.Key = fmt.Sprintf("http://panda-wiki-minio:9000/static-file/%s", req.Key)
    }
    var (
        docs *anydoc.ListDocResponse
        err  error
    )
    switch req.CrawlerSource {
    case consts.CrawlerSourceFeishu:
        docs, err = u.anydocClient.FeishuListDocs(ctx, id, req.FeishuSetting.AppID, req.FeishuSetting.AppSecret, req.FeishuSetting.UserAccessToken, req.FeishuSetting.SpaceId)
    case consts.CrawlerSourceUrl, consts.CrawlerSourceFile:
        docs, err = u.anydocClient.GetUrlList(ctx, req.Key, id)
    case consts.CrawlerSourceConfluence:
        docs, err = u.anydocClient.ConfluenceListDocs(ctx, req.Key, req.Filename, id)
    case consts.CrawlerSourceEpub:
        docs, err = u.anydocClient.EpubpListDocs(ctx, req.Key, req.Filename, id)
    case consts.CrawlerSourceMindoc:
        docs, err = u.anydocClient.MindocListDocs(ctx, req.Key, req.Filename, id)
    case consts.CrawlerSourceWikijs:
        docs, err = u.anydocClient.WikijsListDocs(ctx, req.Key, req.Filename, id)
    case consts.CrawlerSourceSiyuan:
        docs, err = u.anydocClient.SiyuanListDocs(ctx, req.Key, req.Filename, id)
    case consts.CrawlerSourceYuque:
        docs, err = u.anydocClient.YuqueListDocs(ctx, req.Key, req.Filename, id)
    case consts.CrawlerSourceSitemap:
        docs, err = u.anydocClient.SitemapListDocs(ctx, req.Key, id)
    case consts.CrawlerSourceRSS:
        docs, err = u.anydocClient.RssListDocs(ctx, req.Key, id)
    case consts.CrawlerSourceNotion:
        docs, err = u.anydocClient.NotionListDocs(ctx, req.Key, id)
    default:
        return nil, fmt.Errorf("parse type %s is not supported", req.CrawlerSource)
    }
    if err != nil {
        return nil, err
    }
    return &v1.CrawlerParseResp{
        ID:   id,
        Docs: docs.Data.Docs,
    }, nil
}
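
// ExportDoc queues an asynchronous export of a single document and returns
// the task id to poll. A non-empty SpaceId selects the Feishu export path;
// all other sources go through the generic URL export.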
func (u *CrawlerUsecase) ExportDoc(ctx context.Context, req *v1.CrawlerExportReq) (*v1.CrawlerExportResp, error) {
    var taskId string
    if req.SpaceId != "" {
        urlExportRes, err := u.anydocClient.FeishuExportDoc(ctx, req.ID, req.DocID, req.FileType, req.SpaceId, req.KbID)
        if err != nil {
            return nil, err
        }
        taskId = urlExportRes.Data
    } else {
        urlExportRes, err := u.anydocClient.UrlExport(ctx, req.ID, req.DocID, req.KbID)
        if err != nil {
            return nil, err
        }
        taskId = urlExportRes.Data
    }
    return &v1.CrawlerExportResp{
        TaskId: taskId,
    }, nil
}
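
// ScrapeGetResult polls a single export task and, once it has completed,
// downloads the rendered Markdown.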
func (u *CrawlerUsecase) ScrapeGetResult(ctx context.Context, taskId string) (*v1.CrawlerResultResp, error) {
    taskRes, err := u.anydocClient.TaskList(ctx, []string{taskId})
    if err != nil {
        return nil, err
    }
    // Guard against an empty result set before indexing into it.
    if len(taskRes.Data) == 0 {
        return nil, fmt.Errorf("task %s not found", taskId)
    }
    task := taskRes.Data[0]
    switch task.Status {
    case anydoc.StatusPending, anydoc.StatusInProgress:
        return &v1.CrawlerResultResp{
            Status: consts.CrawlerStatusPending,
        }, nil
    case anydoc.StatusFailed:
        return &v1.CrawlerResultResp{
            Status: consts.CrawlerStatusFailed,
        }, fmt.Errorf("file crawl failed: %s", task.Err)
    case anydoc.StatusCompleted:
        fileBytes, err := u.anydocClient.DownloadDoc(ctx, task.Markdown)
        if err != nil {
            return nil, err
        }
        return &v1.CrawlerResultResp{
            Status:  consts.CrawlerStatusCompleted,
            Content: string(fileBytes),
        }, nil
    default:
        return nil, fmt.Errorf("unsupported task status: %s", task.Status)
    }
}
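
// ScrapeGetResults polls a batch of export tasks. The aggregate Status stays
// pending while any task is still running; per-task status and content are
// returned in List.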
func (u *CrawlerUsecase) ScrapeGetResults(ctx context.Context, taskIds []string) (*v1.CrawlerResultsResp, error) {
    taskRes, err := u.anydocClient.TaskList(ctx, taskIds)
    if err != nil {
        return nil, err
    }
    list := make([]v1.CrawlerResultItem, 0, len(taskRes.Data))
    status := consts.CrawlerStatusCompleted
    for _, data := range taskRes.Data {
        if slices.Contains([]anydoc.Status{anydoc.StatusPending, anydoc.StatusInProgress}, data.Status) {
            status = consts.CrawlerStatusPending
        }
        // Only completed tasks have a Markdown artifact to download.
        var content string
        if data.Status == anydoc.StatusCompleted {
            fileBytes, err := u.anydocClient.DownloadDoc(ctx, data.Markdown)
            if err != nil {
                return nil, err
            }
            content = string(fileBytes)
        }
        list = append(list, v1.CrawlerResultItem{
            TaskId:  data.TaskId,
            Status:  consts.CrawlerStatus(data.Status),
            Content: content,
        })
    }
    return &v1.CrawlerResultsResp{
        Status: status,
        List:   list,
    }, nil
}
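
// A minimal sketch of the intended call flow (not part of the original file):
// parse a source, export one of the listed documents, then poll for the
// Markdown. Error checks are elided; the doc.ID field and kbID value are
// assumptions for illustration, while the methods and request types are the
// ones defined above.
//
//    resp, err := u.ParseUrl(ctx, &v1.CrawlerParseReq{
//        CrawlerSource: consts.CrawlerSourceUrl,
//        Key:           "https://example.com/docs",
//    })
//    doc := resp.Docs[0] // pick a document from the listing
//    exp, err := u.ExportDoc(ctx, &v1.CrawlerExportReq{
//        ID:    resp.ID,
//        DocID: doc.ID, // assumed field name
//        KbID:  kbID,   // assumed knowledge-base id
//    })
//    result, err := u.ScrapeGetResult(ctx, exp.TaskId)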