Skip to content

Commit

Permalink
fix 修复一些死锁问题;katana 爬虫goroutine泄露;其他问题
Browse files Browse the repository at this point in the history
  • Loading branch information
yhy0 committed Jul 5, 2024
1 parent 11abb0e commit 2836308
Show file tree
Hide file tree
Showing 90 changed files with 1,066 additions and 81,302 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<img alt="Release" src="https://img.shields.io/github/license/yhy0/Jie"/>
</a>
<a href="https://github.com/yhy0/Jie">
<img alt="Release" src="https://img.shields.io/badge/release-v1.1.1-brightgreen"/>
<img alt="Release" src="https://img.shields.io/badge/release-v1.2.0-brightgreen"/>
</a>
<a href="https://github.com/yhy0/Jie">
<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/yhy0/Jie?color=9cf"/>
Expand Down
2 changes: 1 addition & 1 deletion README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<img alt="Release" src="https://img.shields.io/github/license/yhy0/Jie"/>
</a>
<a href="https://github.com/yhy0/Jie">
<img alt="Release" src="https://img.shields.io/badge/release-v1.1.1-brightgreen"/>
<img alt="Release" src="https://img.shields.io/badge/release-v1.2.0-brightgreen"/>
</a>
<a href="https://github.com/yhy0/Jie">
<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/yhy0/Jie?color=9cf"/>
Expand Down
2 changes: 1 addition & 1 deletion conf/banner.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ var Banner = `

// Website is the URL of the project's GitHub repository.
const Website = "https://github.com/yhy0/Jie"

const Version = "1.1.1"
const Version = "1.2.0"
3 changes: 3 additions & 0 deletions conf/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package conf

import (
folderutil "github.com/projectdiscovery/utils/folder"
wappalyzer "github.com/projectdiscovery/wappalyzergo"
"path/filepath"
)

Expand All @@ -20,6 +21,8 @@ var NoProgressBar bool
// FilePath 一些配置文件的默认位置
var FilePath string

// Wappalyzer is the shared wappalyzergo client for this package.
// It starts out nil; Preparations assigns it via wappalyzer.New when it is unset.
var Wappalyzer *wappalyzer.Wappalyze

func init() {
homedir := folderutil.HomeDirOrDefault("")

Expand Down
6 changes: 6 additions & 0 deletions conf/envCheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package conf
import (
"fmt"
"github.com/go-rod/rod/lib/launcher"
wappalyzer "github.com/projectdiscovery/wappalyzergo"
"os"
"os/exec"
)
Expand All @@ -16,6 +17,11 @@ import (
var ChromePath string

func Preparations() {
if Wappalyzer == nil {
// wappalyzergo 中已经处理了 syscall.Dup2(int(devNull.Fd()), int(os.Stderr.Fd())) ,单元测试也是 ok 的,这里为啥还会有
Wappalyzer, _ = wappalyzer.New()
}

if !GlobalConfig.NoPortScan { // 不进行端口扫描时,不检查这些
Plugin["portScan"] = false
// 检查 nmap 是否已安装
Expand Down
4 changes: 2 additions & 2 deletions crawler/crawlergo/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@ package config_test
import (
"github.com/yhy0/Jie/crawler/crawlergo/config"
"testing"

"github.com/stretchr/testify/assert"
)

func TestStaticSuffix(t *testing.T) {
assert.Equal(t, true, config.StaticSuffixSet.Contains("png"))
assert.Equal(t, false, config.StaticSuffixSet.Contains("demo"))

assert.Equal(t, true, config.ScriptSuffixSet.Contains("asp"))
assert.Equal(t, false, config.ScriptSuffixSet.Contains("demo"))
}
2 changes: 1 addition & 1 deletion crawler/crawlergo/domain_collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package crawlergo
import (
"github.com/yhy0/Jie/crawler/crawlergo/model"
"strings"

mapset "github.com/deckarep/golang-set/v2"
)

Expand Down
24 changes: 12 additions & 12 deletions crawler/crawlergo/engine/after_dom_tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,23 @@ import (
"os"
"strings"
"time"

"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/chromedp"
)

// AfterDOMRun 在DOMContentLoaded完成后执行
func (tab *Tab) AfterDOMRun() {
defer tab.WG.Done()

// logging.Logger.Debug("afterDOMRun start")

// 获取当前body节点的nodeId 用于之后查找子节点
if !tab.getBodyNodeId() {
// logging.Logger.Debug("no body document NodeID, exit.")
return
}

tab.domWG.Add(2)
go tab.fillForm()
go tab.setObserverJS()
Expand Down Expand Up @@ -62,11 +62,11 @@ func (tab *Tab) fillForm() {
f := FillForm{
tab: tab,
}

go f.fillInput()
go f.fillMultiSelect()
go f.fillTextarea()

tab.fillFormWG.Wait()
// logging.Logger.Debug("fillForm end")
}
Expand All @@ -89,7 +89,7 @@ func (f *FillForm) fillInput() {
defer f.tab.fillFormWG.Done()
var nodes []*cdp.Node
ctx := f.tab.GetExecutor()

tCtx, cancel := context.WithTimeout(ctx, time.Second*2)
defer cancel()
// 首先判断input标签是否存在,减少等待时间 提前退出
Expand All @@ -103,13 +103,13 @@ func (f *FillForm) fillInput() {
}
// 获取所有的input标签
err := chromedp.Nodes(`input`, &nodes, chromedp.ByQueryAll).Do(tCtx)

if err != nil {
logging.Logger.Debug("get all input element err")
logging.Logger.Debug(err)
return
}

// 找出 type 为空 或者 type=text
for _, node := range nodes {
// 兜底超时
Expand Down Expand Up @@ -151,7 +151,7 @@ func (f *FillForm) fillTextarea() {
tCtx, cancel := context.WithTimeout(ctx, time.Second*2)
defer cancel()
value := f.GetMatchInputText("other")

textareaNodes, textareaErr := f.tab.GetNodeIDs(`textarea`)
if textareaErr != nil || len(textareaNodes) == 0 {
// logging.Logger.Debug("fillTextarea: get textarea element err")
Expand All @@ -160,7 +160,7 @@ func (f *FillForm) fillTextarea() {
}
return
}

_ = chromedp.SendKeys(textareaNodes, value, chromedp.ByNodeID).Do(tCtx)
}

Expand Down Expand Up @@ -188,7 +188,7 @@ func (f *FillForm) GetMatchInputText(name string) string {
return value
}
}

name = strings.ToLower(name)
for key, item := range config.InputTextMap {
for _, keyword := range item["keyword"].([]string) {
Expand Down
28 changes: 14 additions & 14 deletions crawler/crawlergo/engine/after_loaded_tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
"github.com/yhy0/Jie/crawler/crawlergo/tools"
"github.com/yhy0/logging"
"time"

"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/chromedp"
)
Expand All @@ -27,11 +27,11 @@ func (tab *Tab) AfterLoadedRun() {
tab.formSubmitWG.Add(2)
tab.loadedWG.Add(3)
tab.removeLis.Add(1)

go tab.formSubmit()
tab.formSubmitWG.Wait()
// logging.Logger.Debug("formSubmit end")

if tab.config.EventTriggerMode == config.EventTriggerAsync {
go tab.triggerJavascriptProtocol()
go tab.triggerInlineEvents()
Expand All @@ -44,10 +44,10 @@ func (tab *Tab) AfterLoadedRun() {
time.Sleep(tab.config.EventTriggerInterval)
tab.triggerJavascriptProtocol()
}

// 事件触发之后 需要等待一点时间让浏览器成功发出ajax请求 更新DOM
time.Sleep(tab.config.BeforeExitDelay)

go tab.RemoveDOMListener()
tab.removeLis.Wait()
// logging.Logger.Debug("afterLoadedRun end")
Expand All @@ -58,12 +58,12 @@ func (tab *Tab) AfterLoadedRun() {
自动化点击提交表单
*/
func (tab *Tab) formSubmit() {

// logging.Logger.Debug("formSubmit start")

// 首先对form表单设置target
tab.setFormToFrame()

// 接下来尝试三种方式提交表单
go tab.clickSubmit()
go tab.clickAllButton()
Expand All @@ -77,7 +77,7 @@ func (tab *Tab) setFormToFrame() {
// 首先新建 frame
nameStr := tools.RandSeq(8)
tab.Evaluate(fmt.Sprintf(js.NewFrameTemplate, nameStr, nameStr))

// 接下来将所有的 form 节点target都指向它
ctx := tab.GetExecutor()
formNodes, formErr := tab.GetNodeIDs(`form`)
Expand All @@ -99,10 +99,10 @@ func (tab *Tab) setFormToFrame() {
*/
func (tab *Tab) clickSubmit() {
defer tab.formSubmitWG.Done()

// 首先点击按钮 type=submit
ctx := tab.GetExecutor()

// 获取所有的form节点 直接执行submit
formNodes, formErr := tab.GetNodeIDs(`form`)
if formErr != nil || len(formNodes) == 0 {
Expand All @@ -115,7 +115,7 @@ func (tab *Tab) clickSubmit() {
tCtx1, cancel1 := context.WithTimeout(ctx, time.Second*2)
defer cancel1()
_ = chromedp.Submit(formNodes, chromedp.ByNodeID).Do(tCtx1)

// 获取所有的input标签
inputNodes, inputErr := tab.GetNodeIDs(`form input[type=submit]`)
if inputErr != nil || len(inputNodes) == 0 {
Expand All @@ -136,7 +136,7 @@ click all button
*/
func (tab *Tab) clickAllButton() {
defer tab.formSubmitWG.Done()

// 获取所有的form中的button节点
ctx := tab.GetExecutor()
// 获取所有的button标签
Expand All @@ -151,7 +151,7 @@ func (tab *Tab) clickAllButton() {
tCtx, cancel1 := context.WithTimeout(ctx, time.Second*2)
defer cancel1()
_ = chromedp.Click(btnNodeIDs, chromedp.ByNodeID).Do(tCtx)

// 使用JS的click方法进行点击
var btnNodes []*cdp.Node
tCtx2, cancel2 := context.WithTimeout(ctx, time.Second*2)
Expand Down
20 changes: 10 additions & 10 deletions crawler/crawlergo/engine/browser.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"log"
"sync"
"time"

"github.com/chromedp/cdproto/browser"
"github.com/chromedp/chromedp"
)
Expand All @@ -33,28 +33,28 @@ func InitBrowser(proxy string, noHeadless bool) *Browser {
chromedp.Flag("no-sandbox", true),
// 忽略证书错误
chromedp.Flag("ignore-certificate-errors", true),

chromedp.Flag("disable-images", true),
//
chromedp.Flag("disable-web-security", true),
//
chromedp.Flag("disable-xss-auditor", true),
//
chromedp.Flag("disable-setuid-sandbox", true),

chromedp.Flag("allow-running-insecure-content", true),

chromedp.Flag("disable-webgl", true),

chromedp.Flag("disable-popup-blocking", true),

chromedp.WindowSize(1920, 1080),
)
// 设置浏览器代理
if proxy != "" {
opts = append(opts, chromedp.ProxyServer(proxy))
}

allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
bctx, _ := chromedp.NewContext(allocCtx,
chromedp.WithLogf(log.Printf),
Expand All @@ -68,7 +68,7 @@ func InitBrowser(proxy string, noHeadless bool) *Browser {
}
bro.Cancel = &cancel
bro.Ctx = &bctx

return &bro
}

Expand All @@ -78,7 +78,7 @@ func ConnectBrowser(wsUrl string, extraHeaders map[string]interface{}) *Browser
bctx, _ := chromedp.NewContext(allocCtx,
chromedp.WithLogf(log.Printf),
)

err := chromedp.Run(bctx)
if err != nil {
// couldn't connect to the remote browser, need to exit
Expand All @@ -87,7 +87,7 @@ func ConnectBrowser(wsUrl string, extraHeaders map[string]interface{}) *Browser
bro.Cancel = &cancel
bro.Ctx = &bctx
bro.ExtraHeaders = extraHeaders

return &bro
}

Expand Down
16 changes: 15 additions & 1 deletion crawler/crawlergo/engine/intercept_request.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bufio"
"context"
"encoding/base64"
"fmt"
regexp "github.com/wasilibs/go-re2"
"github.com/yhy0/Jie/crawler/crawlergo/config"
"github.com/yhy0/Jie/crawler/crawlergo/model"
Expand Down Expand Up @@ -33,9 +34,22 @@ func (tab *Tab) InterceptRequest(v *fetch.EventRequestPaused) {
_ = fetch.ContinueRequest(v.RequestID).Do(ctx)
return
}
var postData string
if _req.HasPostData && len(_req.PostDataEntries) > 0 {
for p := range _req.PostDataEntries {
postData += _req.PostDataEntries[p].Bytes
}
}
fmt.Println("Post data ", postData)
data, err := base64.StdEncoding.DecodeString(postData)
if err != nil {
logging.Logger.Errorln(err)
}
fmt.Println("Post data base64 ", data)

_option := model.Options{
Headers: _req.Headers,
PostData: _req.PostData,
PostData: postData,
}
req := model.GetRequest(_req.Method, url, _option)

Expand Down
Loading

0 comments on commit 2836308

Please sign in to comment.