Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in
Toggle navigation
M
mongoose
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esp
mongoose
Commits
ff159bf3
Commit
ff159bf3
authored
8 years ago
by
Artem Bulavin
Committed by
Cesanta Bot
8 years ago
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Simple mongoose http urls crawler
PUBLISHED_FROM=4eead54610606827963e7c244fcd8ab9a13d4c07
parent
8cdd19bc
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
94 additions
and
0 deletions
+94
-0
Makefile
examples/simple_crawler/Makefile
+3
-0
simple_crawler.c
examples/simple_crawler/simple_crawler.c
+91
-0
No files found.
examples/simple_crawler/Makefile
0 → 100644
View file @
ff159bf3
# Program name for this example (presumably consumed by ../examples.mk - confirm).
PROG = simple_crawler
# Adds the SLRE regex source file to the build flags (assumes examples.mk
# forwards MODULE_CFLAGS to the compiler - TODO confirm).
MODULE_CFLAGS = ../../../slre/slre.c
# Pull in the build rules from the parent examples directory.
include ../examples.mk
This diff is collapsed.
Click to expand it.
examples/simple_crawler/simple_crawler.c
0 → 100644
View file @
ff159bf3
#include <stdio.h>
#include <string.h>
#include "mongoose.h"
#include "../../../slre/slre.h"
/*
 * SLRE pattern used to pull absolute links out of a page body.
 *
 * It matches `href="` followed by an http:// or https:// URL.
 * Capture 0 is the whole URL; capture 1 is the scheme prefix ("http://" or
 * "https://"). The URL ends at the first whitespace, quote or angle bracket.
 */
static const char *regex =
    "href=\"((https?://)[^\\s/'\"<>]+/?[^\\s'\"<>]*)";
/*
 * Depth at which link extraction stops: a page loaded at this depth is
 * reported but its links are not followed. The start page is depth 0.
 */
const int max_depth = 2;
/*
 * Per-connection crawl state, attached to mg_connection::user_data.
 * Both the struct and the url buffer are heap-allocated by crawl_page()
 * and released when the connection closes.
 */
struct userdata {
  char *url; /* NUL-terminated copy of the URL being fetched */
  int depth; /* link distance from the start page (start page is 0) */
};
/*
 * Starts an HTTP request for `url` on `mgr`.
 *
 * url_len is the number of bytes of `url` to use; pass (size_t) ~0 to have
 * the length taken with strlen(). `depth` is recorded with the request.
 */
void crawl_page(struct mg_mgr *mgr, const char *url, size_t url_len,
                int depth);
/*
 * Processes a completed HTTP reply `hm` received on `nc`: reports the page
 * and schedules crawls for the links found in its body.
 */
void handle_reply(struct mg_connection *nc, struct http_message *hm);
static
void
event_handler
(
struct
mg_connection
*
nc
,
int
event
,
void
*
data
)
{
struct
http_message
*
hm
=
(
struct
http_message
*
)
data
;
int
connect_status
;
switch
(
event
)
{
case
MG_EV_CONNECT
:
connect_status
=
*
(
int
*
)
data
;
if
(
connect_status
!=
0
)
{
printf
(
"Error while loading page: %s, error: %s
\n
"
,
((
struct
userdata
*
)
nc
->
user_data
)
->
url
,
strerror
(
connect_status
));
}
break
;
case
MG_EV_CLOSE
:
free
(((
struct
userdata
*
)
nc
->
user_data
)
->
url
);
free
(
nc
->
user_data
);
break
;
case
MG_EV_HTTP_REPLY
:
handle_reply
(
nc
,
hm
);
nc
->
flags
|=
MG_F_SEND_AND_CLOSE
;
break
;
default:
break
;
}
}
/*
 * Entry point: crawls outward from a fixed start page.
 *
 * The event loop runs only while the manager still has active connections.
 * Links are scheduled before their parent connection closes, so the list
 * empties only once the whole crawl is done. The original looped forever,
 * which made the mg_mgr_free() call below unreachable.
 */
int main(void) {
  struct mg_mgr mgr;

  mg_mgr_init(&mgr, NULL);

  /* ~0 converts to (size_t) ~0, asking crawl_page() to strlen() the URL. */
  crawl_page(&mgr, "http://www.simpleweb.org/", ~0, 0);

  while (mg_next(&mgr, NULL) != NULL) {
    mg_mgr_poll(&mgr, 1000);
  }

  mg_mgr_free(&mgr);
  return 0;
}
/*
 * Starts an HTTP request for a URL and attaches crawl state to it.
 *
 * mgr     - event manager that will own the new connection.
 * url     - start of the URL text; it need NOT be NUL-terminated when
 *           url_len is given (handle_reply() passes slices of a page body).
 * url_len - number of bytes of `url` to use, or (size_t) ~0 to use strlen().
 * depth   - link distance from the start page, stored with the request.
 *
 * A heap-allocated struct userdata (with its own NUL-terminated URL copy) is
 * stored in nc->user_data; event_handler() frees it on MG_EV_CLOSE. If the
 * request cannot be started, everything allocated here is freed and the
 * function returns without creating a connection.
 */
void crawl_page(struct mg_mgr *mgr, const char *url, size_t url_len,
                int depth) {
  struct mg_connection *nc;
  struct userdata *data;

  if (url_len == (size_t) ~0) {
    url_len = strlen(url);
  }

  data = malloc(sizeof(*data));
  if (data == NULL) {
    fprintf(stderr, "Out of memory while scheduling a page\n");
    return;
  }

  data->url = malloc(url_len + 1);
  if (data->url == NULL) {
    fprintf(stderr, "Out of memory while scheduling a page\n");
    free(data);
    return;
  }
  memcpy(data->url, url, url_len);
  data->url[url_len] = '\0';
  data->depth = depth;

  /*
   * Connect using the private NUL-terminated copy: `url` may point into the
   * middle of a response body and run on past url_len.
   */
  nc = mg_connect_http(mgr, event_handler, data->url, NULL, NULL);
  if (nc == NULL) {
    fprintf(stderr, "Failed to start request for: %s\n", data->url);
    free(data->url);
    free(data);
    return;
  }

  nc->user_data = data;
}
/*
 * Handles a completed HTTP reply: prints the loaded URL and its depth, then
 * (unless the depth limit has been reached) scans the body for absolute
 * http/https links and schedules each one with crawl_page() at depth + 1.
 *
 * nc - connection the reply arrived on; nc->user_data must hold the
 *      struct userdata attached by crawl_page().
 * hm - parsed reply; only hm->body is examined.
 */
void handle_reply(struct mg_connection *nc, struct http_message *hm) {
  /* The pattern has two capture groups: whole URL and scheme prefix. */
  enum { NUM_CAPS = 2 };

  struct userdata *ud = (struct userdata *) nc->user_data;
  const char *body = hm->body.p;
  /*
   * The body is a length-delimited buffer, not a C string, so use its stored
   * length instead of strlen(). slre_match() takes an int length; bodies
   * beyond INT_MAX bytes are not expected here.
   */
  int body_len = (int) hm->body.len;
  int cursor = 0;
  int offset;
  struct slre_cap caps[NUM_CAPS];

  if (ud == NULL) {
    return;
  }

  printf("Loaded url: %s at depth %d\n", ud->url, ud->depth);

  if (ud->depth >= max_depth) {
    return;
  }

  while (cursor < body_len) {
    /* A positive result is the number of bytes scanned up to the match end. */
    offset = slre_match(regex, body + cursor, body_len - cursor, caps,
                        NUM_CAPS, SLRE_IGNORE_CASE);
    if (offset <= 0) {
      break;
    }

    /* caps[0] is the full URL; it is a slice of the body, not NUL-terminated. */
    crawl_page(nc->mgr, caps[0].ptr, (size_t) caps[0].len, ud->depth + 1);
    cursor += offset;
  }
}
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment