Skip to content

Commit 1c854af

Browse files
perf(client): optimize file structure copying in multipart requests
1 parent 4985a34 commit 1c854af

10 files changed

Lines changed: 191 additions & 103 deletions

File tree

src/cas_parser/_files.py

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
import io
44
import os
55
import pathlib
6-
from typing import overload
7-
from typing_extensions import TypeGuard
6+
from typing import Sequence, cast, overload
7+
from typing_extensions import TypeVar, TypeGuard
88

99
import anyio
1010

@@ -17,7 +17,9 @@
1717
HttpxFileContent,
1818
HttpxRequestFiles,
1919
)
20-
from ._utils import is_tuple_t, is_mapping_t, is_sequence_t
20+
from ._utils import is_list, is_mapping, is_tuple_t, is_mapping_t, is_sequence_t
21+
22+
_T = TypeVar("_T")
2123

2224

2325
def is_base64_file_input(obj: object) -> TypeGuard[Base64FileInput]:
@@ -121,3 +123,51 @@ async def async_read_file_content(file: FileContent) -> HttpxFileContent:
121123
return await anyio.Path(file).read_bytes()
122124

123125
return file
126+
127+
128+
def deepcopy_with_paths(item: _T, paths: Sequence[Sequence[str]]) -> _T:
    """Return `item` with only the containers lying on `paths` duplicated.

    This guards the caller's data against mutation by `extract_files`
    while avoiding a copy of the entire structure: a dict or list is
    copied only when at least one path passes through it, and every
    other value is handed back by reference.

    Each path is a sequence of string segments. A segment is looked up as
    a key when the current value is a mapping; the literal segment
    "<array>" stands for "every entry" when the current value is a list.

    Example with paths=[["foo", "files", "file"]] applied to:
        {
            "foo": {
                "bar": {"baz": {}},
                "files": {"file": <content>}
            }
        }
    The outermost dict, "foo" and "files" sit on the path, so each of
    them is copied. "bar" and "baz" are off the path and are shared with
    the input.

    Args:
        item: The structure to copy selectively.
        paths: The paths whose containers must be copied.

    Returns:
        `item` itself when nothing needs copying, otherwise a new
        container that shares all off-path values with `item`.
    """
    # Path traversal starts at the first segment of every path.
    return _deepcopy_with_paths(item, paths, index=0)
146+
147+
148+
def _deepcopy_with_paths(item: _T, paths: Sequence[Sequence[str]], index: int) -> _T:
    """Recursive worker: copy `item` only where some path continues through it.

    Args:
        item: The value found at the current depth.
        paths: The paths that led to `item`; all of them share the same
            first `index` segments.
        index: Position of the path segment to match against `item`.

    Returns:
        `item` unchanged (same object) when it is neither a mapping nor a
        list, or when no path goes any deeper through it. Otherwise a new
        dict or list whose on-path children were processed recursively.
    """
    if not paths:
        return item

    depth = index + 1

    if is_mapping(item):
        # Bucket the paths by the key they select at this depth, dropping
        # the paths that have already been fully consumed.
        grouped: dict[str, list[Sequence[str]]] = {}
        for candidate in paths:
            if len(candidate) <= index:
                continue
            segment = candidate[index]
            if segment in grouped:
                grouped[segment].append(candidate)
            else:
                grouped[segment] = [candidate]

        # No path descends into this mapping, so nothing in it will be
        # mutated and the original object can be shared.
        if not grouped:
            return item

        copied = dict(item)
        for segment, branch in grouped.items():
            if segment not in copied:
                continue
            copied[segment] = _deepcopy_with_paths(copied[segment], branch, depth)
        return cast(_T, copied)

    if is_list(item):
        # A list is only traversed by paths that carry the "<array>"
        # marker at this depth.
        through_list = [
            candidate
            for candidate in paths
            if len(candidate) > index and candidate[index] == "<array>"
        ]
        if through_list:
            return cast(_T, [_deepcopy_with_paths(element, through_list, depth) for element in item])

        # No path expects a list here: share it rather than copy it.
        return cast(_T, item)

    return item

src/cas_parser/_utils/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
coerce_integer as coerce_integer,
2525
file_from_path as file_from_path,
2626
strip_not_given as strip_not_given,
27-
deepcopy_minimal as deepcopy_minimal,
2827
get_async_library as get_async_library,
2928
maybe_coerce_float as maybe_coerce_float,
3029
get_required_header as get_required_header,

src/cas_parser/_utils/_utils.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -177,21 +177,6 @@ def is_iterable(obj: object) -> TypeGuard[Iterable[object]]:
177177
return isinstance(obj, Iterable)
178178

179179

180-
def deepcopy_minimal(item: _T) -> _T:
181-
"""Minimal reimplementation of copy.deepcopy() that will only copy certain object types:
182-
183-
- mappings, e.g. `dict`
184-
- list
185-
186-
This is done for performance reasons.
187-
"""
188-
if is_mapping(item):
189-
return cast(_T, {k: deepcopy_minimal(v) for k, v in item.items()})
190-
if is_list(item):
191-
return cast(_T, [deepcopy_minimal(entry) for entry in item])
192-
return item
193-
194-
195180
# copied from https://github.com/Rapptz/RoboDanny
196181
def human_join(seq: Sequence[str], *, delim: str = ", ", final: str = "or") -> str:
197182
size = len(seq)

src/cas_parser/resources/cams_kfintech.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77
import httpx
88

99
from ..types import cams_kfintech_parse_params
10+
from .._files import deepcopy_with_paths
1011
from .._types import Body, Omit, Query, Headers, NotGiven, omit, not_given
11-
from .._utils import extract_files, maybe_transform, deepcopy_minimal, async_maybe_transform
12+
from .._utils import extract_files, maybe_transform, async_maybe_transform
1213
from .._compat import cached_property
1314
from .._resource import SyncAPIResource, AsyncAPIResource
1415
from .._response import (
@@ -78,12 +79,13 @@ def parse(
7879
7980
timeout: Override the client-level default timeout for this request, in seconds
8081
"""
81-
body = deepcopy_minimal(
82+
body = deepcopy_with_paths(
8283
{
8384
"password": password,
8485
"pdf_file": pdf_file,
8586
"pdf_url": pdf_url,
86-
}
87+
},
88+
[["pdf_file"]],
8789
)
8890
files = extract_files(cast(Mapping[str, object], body), paths=[["pdf_file"]])
8991
if files:
@@ -157,12 +159,13 @@ async def parse(
157159
158160
timeout: Override the client-level default timeout for this request, in seconds
159161
"""
160-
body = deepcopy_minimal(
162+
body = deepcopy_with_paths(
161163
{
162164
"password": password,
163165
"pdf_file": pdf_file,
164166
"pdf_url": pdf_url,
165-
}
167+
},
168+
[["pdf_file"]],
166169
)
167170
files = extract_files(cast(Mapping[str, object], body), paths=[["pdf_file"]])
168171
if files:

src/cas_parser/resources/cdsl/cdsl.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@
1515
AsyncFetchResourceWithStreamingResponse,
1616
)
1717
from ...types import cdsl_parse_pdf_params
18+
from ..._files import deepcopy_with_paths
1819
from ..._types import Body, Omit, Query, Headers, NotGiven, omit, not_given
19-
from ..._utils import extract_files, maybe_transform, deepcopy_minimal, async_maybe_transform
20+
from ..._utils import extract_files, maybe_transform, async_maybe_transform
2021
from ..._compat import cached_property
2122
from ..._resource import SyncAPIResource, AsyncAPIResource
2223
from ..._response import (
@@ -94,12 +95,13 @@ def parse_pdf(
9495
9596
timeout: Override the client-level default timeout for this request, in seconds
9697
"""
97-
body = deepcopy_minimal(
98+
body = deepcopy_with_paths(
9899
{
99100
"password": password,
100101
"pdf_file": pdf_file,
101102
"pdf_url": pdf_url,
102-
}
103+
},
104+
[["pdf_file"]],
103105
)
104106
files = extract_files(cast(Mapping[str, object], body), paths=[["pdf_file"]])
105107
if files:
@@ -181,12 +183,13 @@ async def parse_pdf(
181183
182184
timeout: Override the client-level default timeout for this request, in seconds
183185
"""
184-
body = deepcopy_minimal(
186+
body = deepcopy_with_paths(
185187
{
186188
"password": password,
187189
"pdf_file": pdf_file,
188190
"pdf_url": pdf_url,
189-
}
191+
},
192+
[["pdf_file"]],
190193
)
191194
files = extract_files(cast(Mapping[str, object], body), paths=[["pdf_file"]])
192195
if files:

src/cas_parser/resources/contract_note.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88
import httpx
99

1010
from ..types import contract_note_parse_params
11+
from .._files import deepcopy_with_paths
1112
from .._types import Body, Omit, Query, Headers, NotGiven, omit, not_given
12-
from .._utils import extract_files, maybe_transform, deepcopy_minimal, async_maybe_transform
13+
from .._utils import extract_files, maybe_transform, async_maybe_transform
1314
from .._compat import cached_property
1415
from .._resource import SyncAPIResource, AsyncAPIResource
1516
from .._response import (
@@ -110,13 +111,14 @@ def parse(
110111
111112
timeout: Override the client-level default timeout for this request, in seconds
112113
"""
113-
body = deepcopy_minimal(
114+
body = deepcopy_with_paths(
114115
{
115116
"broker_type": broker_type,
116117
"password": password,
117118
"pdf_file": pdf_file,
118119
"pdf_url": pdf_url,
119-
}
120+
},
121+
[["pdf_file"]],
120122
)
121123
files = extract_files(cast(Mapping[str, object], body), paths=[["pdf_file"]])
122124
if files:
@@ -221,13 +223,14 @@ async def parse(
221223
222224
timeout: Override the client-level default timeout for this request, in seconds
223225
"""
224-
body = deepcopy_minimal(
226+
body = deepcopy_with_paths(
225227
{
226228
"broker_type": broker_type,
227229
"password": password,
228230
"pdf_file": pdf_file,
229231
"pdf_url": pdf_url,
230-
}
232+
},
233+
[["pdf_file"]],
231234
)
232235
files = extract_files(cast(Mapping[str, object], body), paths=[["pdf_file"]])
233236
if files:

src/cas_parser/resources/nsdl.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77
import httpx
88

99
from ..types import nsdl_parse_params
10+
from .._files import deepcopy_with_paths
1011
from .._types import Body, Omit, Query, Headers, NotGiven, omit, not_given
11-
from .._utils import extract_files, maybe_transform, deepcopy_minimal, async_maybe_transform
12+
from .._utils import extract_files, maybe_transform, async_maybe_transform
1213
from .._compat import cached_property
1314
from .._resource import SyncAPIResource, AsyncAPIResource
1415
from .._response import (
@@ -78,12 +79,13 @@ def parse(
7879
7980
timeout: Override the client-level default timeout for this request, in seconds
8081
"""
81-
body = deepcopy_minimal(
82+
body = deepcopy_with_paths(
8283
{
8384
"password": password,
8485
"pdf_file": pdf_file,
8586
"pdf_url": pdf_url,
86-
}
87+
},
88+
[["pdf_file"]],
8789
)
8890
files = extract_files(cast(Mapping[str, object], body), paths=[["pdf_file"]])
8991
if files:
@@ -157,12 +159,13 @@ async def parse(
157159
158160
timeout: Override the client-level default timeout for this request, in seconds
159161
"""
160-
body = deepcopy_minimal(
162+
body = deepcopy_with_paths(
161163
{
162164
"password": password,
163165
"pdf_file": pdf_file,
164166
"pdf_url": pdf_url,
165-
}
167+
},
168+
[["pdf_file"]],
166169
)
167170
files = extract_files(cast(Mapping[str, object], body), paths=[["pdf_file"]])
168171
if files:

src/cas_parser/resources/smart.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77
import httpx
88

99
from ..types import smart_parse_cas_pdf_params
10+
from .._files import deepcopy_with_paths
1011
from .._types import Body, Omit, Query, Headers, NotGiven, omit, not_given
11-
from .._utils import extract_files, maybe_transform, deepcopy_minimal, async_maybe_transform
12+
from .._utils import extract_files, maybe_transform, async_maybe_transform
1213
from .._compat import cached_property
1314
from .._resource import SyncAPIResource, AsyncAPIResource
1415
from .._response import (
@@ -79,12 +80,13 @@ def parse_cas_pdf(
7980
8081
timeout: Override the client-level default timeout for this request, in seconds
8182
"""
82-
body = deepcopy_minimal(
83+
body = deepcopy_with_paths(
8384
{
8485
"password": password,
8586
"pdf_file": pdf_file,
8687
"pdf_url": pdf_url,
87-
}
88+
},
89+
[["pdf_file"]],
8890
)
8991
files = extract_files(cast(Mapping[str, object], body), paths=[["pdf_file"]])
9092
if files:
@@ -159,12 +161,13 @@ async def parse_cas_pdf(
159161
160162
timeout: Override the client-level default timeout for this request, in seconds
161163
"""
162-
body = deepcopy_minimal(
164+
body = deepcopy_with_paths(
163165
{
164166
"password": password,
165167
"pdf_file": pdf_file,
166168
"pdf_url": pdf_url,
167-
}
169+
},
170+
[["pdf_file"]],
168171
)
169172
files = extract_files(cast(Mapping[str, object], body), paths=[["pdf_file"]])
170173
if files:

tests/test_deepcopy.py

Lines changed: 0 additions & 58 deletions
This file was deleted.

0 commit comments

Comments
 (0)