Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in
Toggle navigation
M
mongoose
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esp
mongoose
Commits
ff159bf3
Commit
ff159bf3
authored
Aug 16, 2016
by
Artem Bulavin
Committed by
Cesanta Bot
Aug 16, 2016
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Simple mongoose http urls crawler
PUBLISHED_FROM=4eead54610606827963e7c244fcd8ab9a13d4c07
parent
8cdd19bc
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
94 additions
and
0 deletions
+94
-0
Makefile
examples/simple_crawler/Makefile
+3
-0
simple_crawler.c
examples/simple_crawler/simple_crawler.c
+91
-0
No files found.
examples/simple_crawler/Makefile
0 → 100644
View file @
ff159bf3
# Build settings for the simple_crawler example; the shared build rules are
# pulled in from ../examples.mk.
PROG = simple_crawler
# NOTE(review): a .c file is passed through MODULE_CFLAGS, presumably so that
# examples.mk places slre.c on the compiler command line -- confirm there.
MODULE_CFLAGS = ../../../slre/slre.c
include ../examples.mk
examples/simple_crawler/simple_crawler.c
0 → 100644
View file @
ff159bf3
#include <stdio.h>
#include <string.h>
#include "mongoose.h"
#include "../../../slre/slre.h"
static
const
char
*
regex
=
"href=
\"
((https?://)[^
\\
s/'
\"
<>]+/?[^
\\
s'
\"
<>]*)"
;
const
int
max_depth
=
2
;
struct
userdata
{
char
*
url
;
int
depth
;
};
/*
 * Starts an HTTP request for `url` on `mgr` and attaches a struct userdata
 * holding a copy of the URL and `depth` to the new connection.
 * `url_len` is the number of bytes of `url` to copy; pass (size_t) ~0 to have
 * the length taken with strlen(url).
 */
void crawl_page(struct mg_mgr *mgr, const char *url, size_t url_len, int depth);
void
handle_reply
(
struct
mg_connection
*
nc
,
struct
http_message
*
hm
);
static
void
event_handler
(
struct
mg_connection
*
nc
,
int
event
,
void
*
data
)
{
struct
http_message
*
hm
=
(
struct
http_message
*
)
data
;
int
connect_status
;
switch
(
event
)
{
case
MG_EV_CONNECT
:
connect_status
=
*
(
int
*
)
data
;
if
(
connect_status
!=
0
)
{
printf
(
"Error while loading page: %s, error: %s
\n
"
,
((
struct
userdata
*
)
nc
->
user_data
)
->
url
,
strerror
(
connect_status
));
}
break
;
case
MG_EV_CLOSE
:
free
(((
struct
userdata
*
)
nc
->
user_data
)
->
url
);
free
(
nc
->
user_data
);
break
;
case
MG_EV_HTTP_REPLY
:
handle_reply
(
nc
,
hm
);
nc
->
flags
|=
MG_F_SEND_AND_CLOSE
;
break
;
default:
break
;
}
}
/*
 * Entry point: initialises an event manager, requests the start page at
 * depth 0 and then polls the manager forever.
 */
int main() {
  struct mg_mgr mgr;

  mg_mgr_init(&mgr, NULL);

  /* ~0 as the length asks crawl_page() to measure the URL with strlen(). */
  crawl_page(&mgr, "http://www.simpleweb.org/", ~0, 0);

  /* Each poll waits up to 1000 ms for network events. */
  while (1) {
    mg_mgr_poll(&mgr, 1000);
  }

  /* Not reached: the polling loop above has no exit condition. */
  mg_mgr_free(&mgr);
  return 0;
}
void
crawl_page
(
struct
mg_mgr
*
mgr
,
const
char
*
url
,
size_t
url_len
,
int
depth
)
{
struct
mg_connection
*
nc
;
struct
userdata
*
data
=
malloc
(
sizeof
(
struct
userdata
));
if
(
url_len
==
(
size_t
)
~
0
)
{
url_len
=
strlen
(
url
);
}
data
->
url
=
strncpy
(
malloc
(
url_len
+
1
),
url
,
url_len
);
data
->
url
[
url_len
]
=
'\0'
;
data
->
depth
=
depth
;
nc
=
mg_connect_http
(
mgr
,
event_handler
,
url
,
NULL
,
NULL
);
nc
->
user_data
=
data
;
}
/*
 * Processes a received HTTP reply for a crawl request.
 *
 * nc - connection the reply arrived on; nc->user_data must point to the
 *      struct userdata attached by crawl_page().
 * hm - parsed HTTP message whose body is scanned for links.
 *
 * Prints the loaded URL and its depth. Unless the depth limit has been
 * reached, every match of `regex` in the body is handed to crawl_page() at
 * depth + 1.
 */
void handle_reply(struct mg_connection *nc, struct http_message *hm) {
  /* One capture slot per parenthesised group in `regex`. */
  enum { MAX_MATCHES = 2 };

  struct userdata *ud = (struct userdata *) nc->user_data;
  const char *body = hm->body.p;
  /*
   * The body is a length-delimited buffer, so take its size from the len
   * field; strlen() could read past the end of the received data.
   */
  int body_len = (int) hm->body.len;
  int cursor = 0;
  int offset;
  struct slre_cap caps[MAX_MATCHES];

  printf("Loaded url: %s at depth %d\n", ud->url, ud->depth);

  /* >= rather than == so a depth beyond the limit can never keep crawling. */
  if (ud->depth >= max_depth) {
    return;
  }

  /*
   * A positive return from slre_match() is used as the distance to advance
   * past the match; anything else ends the scan.
   */
  while (cursor < body_len &&
         (offset = slre_match(regex, body + cursor, body_len - cursor, caps,
                              MAX_MATCHES, SLRE_IGNORE_CASE)) > 0) {
    /* caps[0] is the whole URL; it is not NUL-terminated, so pass its length. */
    crawl_page(nc->mgr, caps[0].ptr, (size_t) caps[0].len, ud->depth + 1);
    cursor += offset;
  }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment