Skip to content

Commit 420a424

Browse files
committed
Correct inconsistencies with Rust-based parsing
Prior to this, the parsing behaved differently from the Python-based parsing in three ways: 1. The line contents only included up to the last matched token, meaning comments afterwards were excluded. 2. The type_checking conditional did not work for an alias of the `typing` module, e.g. `if t.TYPE_CHECKING`. 3. Non-ascii Python identifiers weren't supported. This commit fixes both of those.
1 parent 95a5b8f commit 420a424

4 files changed

Lines changed: 96 additions & 9 deletions

File tree

rust/Cargo.lock

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

rust/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ pyimportparse = "0.2.1"
2626
nom = "8.0.0"
2727
nom_locate = "5.0.0"
2828
nom-regex = "0.2.0"
29+
nom-unicode = "0.4.0"
2930

3031
[dependencies.pyo3]
3132
version = "0.24.1"

rust/src/parsing.rs

Lines changed: 64 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@
2727
use nom::branch::alt;
2828
use nom::bytes::complete::{tag, take_until};
2929
use nom::character::complete::{
30-
alphanumeric1, line_ending, multispace1, not_line_ending, space0, space1,
30+
line_ending, multispace1, not_line_ending, space0, space1,
3131
};
32+
use nom_unicode::complete::alphanumeric1;
3233
use nom::combinator::{all_consuming, opt, recognize, value, verify};
3334
use nom::multi::{many0, many1, separated_list1};
3435
use nom::sequence::{delimited, preceded, terminated};
@@ -61,12 +62,28 @@ impl Import {
6162
}
6263
}
6364

64-
pub fn parse_imports(s: &str) -> Result<Vec<Import>, String> {
65-
let s = Span::new(s);
66-
let (_, result) = all_consuming(parse_block(false))
67-
.parse(s)
65+
pub fn parse_imports(python_file_contents: &str) -> Result<Vec<Import>, String> {
66+
let span = Span::new(python_file_contents);
67+
let (_, imports) = all_consuming(parse_block(false))
68+
.parse(span)
6869
.map_err(|e| e.to_string())?;
69-
Ok(result)
70+
Ok(with_corrected_line_contents(python_file_contents, imports))
71+
}
72+
73+
// Return the imports, but with the full line contents.
74+
// Currently our nom parsing only pulls out to the latest token, so we correct it here
75+
// by finding the corresponding whole line, based on the line number.
76+
// TODO: adjust the parsing code so it figures it out correctly in the first place.
77+
fn with_corrected_line_contents(python_file_contents: &str, imports: Vec<Import>) -> Vec<Import> {
78+
let lines: Vec<&str> = python_file_contents.lines().collect();
79+
imports.into_iter().map(
80+
|import| Import {
81+
imported_object: import.imported_object,
82+
line_number: import.line_number,
83+
line_contents: lines[(import.line_number as usize) - 1].trim_start().to_string(),
84+
typechecking_only: import.typechecking_only,
85+
}
86+
).collect()
7087
}
7188

7289
fn parse_block(typechecking_only: bool) -> impl Fn(Span) -> IResult<Span, Vec<Import>> {
@@ -276,8 +293,18 @@ fn parse_relative_module(s: Span) -> IResult<Span, &str> {
276293
Ok((s, result.fragment()))
277294
}
278295

296+
297+
// Parse a valid Python identifier.
298+
//
299+
// Note this is not implemented as thoroughly as in the Python spec.
300+
// Some identifiers will be valid here (e.g. ones that begin with digits)
301+
// that aren't actually valid in Python. Unicode identifiers are supported.
302+
//
303+
// See https://docs.python.org/3/reference/lexical_analysis.html#identifiers
279304
fn parse_identifier(s: Span) -> IResult<Span, &str> {
280-
let (s, result) = recognize(many1(alt((alphanumeric1, tag("_"))))).parse(s)?;
305+
let (s, result) = recognize(
306+
many1(alt((alphanumeric1, tag("_"))))
307+
).parse(s)?;
281308
Ok((s, result.fragment()))
282309
}
283310

@@ -310,11 +337,17 @@ fn parse_space1(s: Span) -> IResult<Span, ()> {
310337
Ok((s, ()))
311338
}
312339

340+
313341
fn parse_if_typechecking(s: Span) -> IResult<Span, Vec<Import>> {
314342
let (s, _) = (
315343
tag("if"),
316344
parse_space1,
317-
alt((tag("TYPE_CHECKING"), tag("typing.TYPE_CHECKING"))),
345+
alt(
346+
(
347+
tag("TYPE_CHECKING"),
348+
preceded(parse_identifier, tag(".TYPE_CHECKING")),
349+
),
350+
),
318351
parse_space0,
319352
tag(":"),
320353
)
@@ -575,6 +608,18 @@ import baz
575608
576609
(r#"
577610
import foo
611+
if t.TYPE_CHECKING: import bar
612+
import baz
613+
"#, &[("foo", false), ("bar", true), ("baz", false)]),
614+
615+
(r#"
616+
import foo
617+
if some_WE1RD_alias.TYPE_CHECKING: import bar
618+
import baz
619+
"#, &[("foo", false), ("bar", true), ("baz", false)]),
620+
621+
(r#"
622+
import foo
578623
if TYPE_CHECKING : import bar
579624
import baz
580625
"#, &[("foo", false), ("bar", true), ("baz", false)]),
@@ -734,17 +779,27 @@ if TYPE_CHECKING:
734779
let imports = parse_imports(
735780
"
736781
import a
782+
import a.b # Comment afterwards.
737783
from b import c
738784
from d import (e)
739-
from f import *",
785+
from f import *
786+
from something.foo import * # Comment afterwards.
787+
if True:
788+
from indented import foo
789+
from ñon_ascii_变 import ラーメン
790+
",
740791
)
741792
.unwrap();
742793
assert_eq!(
743794
vec![
744795
("a".to_owned(), "import a".to_owned()),
796+
("a.b".to_owned(), "import a.b # Comment afterwards.".to_owned()),
745797
("b.c".to_owned(), "from b import c".to_owned()),
746798
("d.e".to_owned(), "from d import (e)".to_owned()),
747799
("f.*".to_owned(), "from f import *".to_owned()),
800+
("something.foo.*".to_owned(), "from something.foo import * # Comment afterwards.".to_owned()),
801+
("indented.foo".to_owned(), "from indented import foo".to_owned()),
802+
("ñon_ascii_变.ラーメン".to_owned(), "from ñon_ascii_变 import ラーメン".to_owned()),
748803
],
749804
imports
750805
.into_iter()

tests/unit/adaptors/test_importscanner.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,9 @@ def test_get_raw_imports():
854854
from . import g
855855
from .. import h
856856
from i import *
857+
from ñon_ascii_变 import *
858+
from . import ñon_ascii_变
859+
import ñon_ascii_变.ラーメン
857860
"""
858861

859862
raw_imported_objects = ImportScanner._get_raw_imported_objects(module_contents)
@@ -901,6 +904,24 @@ def test_get_raw_imports():
901904
line_contents="from i import *",
902905
typechecking_only=False,
903906
),
907+
_ImportedObject(
908+
name="ñon_ascii_变.*",
909+
line_number=9,
910+
line_contents="from ñon_ascii_变 import *",
911+
typechecking_only=False,
912+
),
913+
_ImportedObject(
914+
name=".ñon_ascii_变",
915+
line_number=10,
916+
line_contents="from . import ñon_ascii_变",
917+
typechecking_only=False,
918+
),
919+
_ImportedObject(
920+
name="ñon_ascii_变.ラーメン",
921+
line_number=11,
922+
line_contents="import ñon_ascii_变.ラーメン",
923+
typechecking_only=False,
924+
),
904925
}
905926

906927

0 commit comments

Comments
 (0)